root/maint/gnulib/lib/striconveh.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. iconveh_open
  2. iconveh_close
  3. iconv_carefully
  4. iconv_carefully_1
  5. utf8conv_carefully
  6. mem_cd_iconveh_internal
  7. mem_cd_iconveh
  8. str_cd_iconveh
  9. mem_iconveh
  10. str_iconveh

   1 /* Character set conversion with error handling.
   2    Copyright (C) 2001-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible and Simon Josefsson.
   4 
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 2.1 of the
   8    License, or (at your option) any later version.
   9 
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14 
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17 
  18 #include <config.h>
  19 
  20 /* Specification.  */
  21 #include "striconveh.h"
  22 
  23 #include <errno.h>
  24 #include <stdbool.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 
  28 #if HAVE_ICONV
  29 # include <iconv.h>
  30 # include "unistr.h"
  31 #endif
  32 
  33 #include "c-strcase.h"
  34 #include "c-strcaseeq.h"
  35 
  36 #ifndef SIZE_MAX
  37 # define SIZE_MAX ((size_t) -1)
  38 #endif
  39 
  40 
  41 #if HAVE_ICONV
  42 
  43 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  44    conversion error occurs, we may have to determine the Unicode representation
  45    of the inconvertible character.  */
  46 
  47 int
  48 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
     /* [previous][next][first][last][top][bottom][index][help] */
  49 {
  50   iconv_t cd;
  51   iconv_t cd1;
  52   iconv_t cd2;
  53 
  54   /* Avoid glibc-2.1 bug with EUC-KR.  */
  55 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  56      && !defined _LIBICONV_VERSION
  57   if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  58       || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  59     {
  60       errno = EINVAL;
  61       return -1;
  62     }
  63 # endif
  64 
  65   cd = iconv_open (to_codeset, from_codeset);
  66 
  67   if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  68     cd1 = (iconv_t)(-1);
  69   else
  70     {
  71       cd1 = iconv_open ("UTF-8", from_codeset);
  72       if (cd1 == (iconv_t)(-1))
  73         {
  74           int saved_errno = errno;
  75           if (cd != (iconv_t)(-1))
  76             iconv_close (cd);
  77           errno = saved_errno;
  78           return -1;
  79         }
  80     }
  81 
  82   if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  83 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  84       && !defined __UCLIBC__) \
  85      || _LIBICONV_VERSION >= 0x0105
  86       || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  87 # endif
  88      )
  89     cd2 = (iconv_t)(-1);
  90   else
  91     {
  92       cd2 = iconv_open (to_codeset, "UTF-8");
  93       if (cd2 == (iconv_t)(-1))
  94         {
  95           int saved_errno = errno;
  96           if (cd1 != (iconv_t)(-1))
  97             iconv_close (cd1);
  98           if (cd != (iconv_t)(-1))
  99             iconv_close (cd);
 100           errno = saved_errno;
 101           return -1;
 102         }
 103     }
 104 
 105   cdp->cd = cd;
 106   cdp->cd1 = cd1;
 107   cdp->cd2 = cd2;
 108   return 0;
 109 }
 110 
 111 int
 112 iconveh_close (const iconveh_t *cd)
     /* [previous][next][first][last][top][bottom][index][help] */
 113 {
 114   if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
 115     {
 116       /* Return -1, but preserve the errno from iconv_close.  */
 117       int saved_errno = errno;
 118       if (cd->cd1 != (iconv_t)(-1))
 119         iconv_close (cd->cd1);
 120       if (cd->cd != (iconv_t)(-1))
 121         iconv_close (cd->cd);
 122       errno = saved_errno;
 123       return -1;
 124     }
 125   if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
 126     {
 127       /* Return -1, but preserve the errno from iconv_close.  */
 128       int saved_errno = errno;
 129       if (cd->cd != (iconv_t)(-1))
 130         iconv_close (cd->cd);
 131       errno = saved_errno;
 132       return -1;
 133     }
 134   if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
 135     return -1;
 136   return 0;
 137 }
 138 
 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
    a conversion error, and it returns in *INCREMENTED a boolean telling whether
    it has incremented the input pointers past the error location.

    Return value and errno are as for iconv, with one addition: a positive
    count of irreversible conversions is reported as a failure, i.e. the
    function returns (size_t)(-1) with errno = EILSEQ.  *OUTBUF and
    *OUTBYTESLEFT are advanced only over output of steps that returned 0.  */
 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
 /* Irix iconv() inserts a NUL byte if it cannot convert.
    NetBSD iconv() inserts a question mark if it cannot convert.
    Only GNU libiconv and GNU libc are known to prefer to fail rather
    than doing a lossy conversion.  */
 static size_t
 iconv_carefully (iconv_t cd,
                  const char **inbuf, size_t *inbytesleft,
                  char **outbuf, size_t *outbytesleft,
                  bool *incremented)
 {
   const char *inptr = *inbuf;
   const char *inptr_end = inptr + *inbytesleft;
   char *outptr = *outbuf;
   size_t outsize = *outbytesleft;
   const char *inptr_before;
   size_t res;

   /* With no input there is nothing to convert.  Report success, like iconv
      and like the macro variant below, instead of returning (size_t)(-1)
      with an errno value that no call in this function has set.  */
   if (inptr == inptr_end)
     {
       *incremented = false;
       return 0;
     }

   do
     {
       size_t insize;

       inptr_before = inptr;
       res = (size_t)(-1);

       /* Offer iconv a window of 1, 2, 3, ... bytes until it stops reporting
          an incomplete input sequence (EINVAL).  The bound is expressed as a
          byte count: computing INPTR + INSIZE could form a pointer beyond
          one past the end of the input, which is undefined behaviour.  */
       for (insize = 1; insize <= (size_t) (inptr_end - inptr); insize++)
         {
           res = iconv (cd,
                        (ICONV_CONST char **) &inptr, &insize,
                        &outptr, &outsize);
           if (!(res == (size_t)(-1) && errno == EINVAL))
             break;
           /* iconv can eat up a shift sequence but give EINVAL while attempting
              to convert the first character.  E.g. libiconv does this.  */
           if (inptr > inptr_before)
             {
               res = 0;
               break;
             }
         }

       /* Commit the output only for a clean (reversible) step.  */
       if (res == 0)
         {
           *outbuf = outptr;
           *outbytesleft = outsize;
         }
     }
   while (res == 0 && inptr < inptr_end);

   *inbuf = inptr;
   *inbytesleft = inptr_end - inptr;
   if (res != (size_t)(-1) && res > 0)
     {
       /* iconv() has already incremented INPTR.  We cannot go back to a
          previous INPTR, otherwise the state inside CD would become invalid,
          if FROM_CODESET is a stateful encoding.  So, tell the caller that
          *INBUF has already been incremented.  */
       *incremented = (inptr > inptr_before);
       errno = EILSEQ;
       return (size_t)(-1);
     }
   else
     {
       *incremented = false;
       return res;
     }
 }
 # else
 /* GNU libiconv and GNU libc fail rather than convert lossily, so plain iconv
    already has the required behaviour; *INCREMENTED is always false.  */
 #  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
      (*(incremented) = false, \
       iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
 # endif
 214 
 215 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
 216    converting one character or one shift sequence.  */
 217 static size_t
 218 iconv_carefully_1 (iconv_t cd,
     /* [previous][next][first][last][top][bottom][index][help] */
 219                    const char **inbuf, size_t *inbytesleft,
 220                    char **outbuf, size_t *outbytesleft,
 221                    bool *incremented)
 222 {
 223   const char *inptr_before = *inbuf;
 224   const char *inptr = inptr_before;
 225   const char *inptr_end = inptr_before + *inbytesleft;
 226   char *outptr = *outbuf;
 227   size_t outsize = *outbytesleft;
 228   size_t res = (size_t)(-1);
 229   size_t insize;
 230 
 231   for (insize = 1; inptr_before + insize <= inptr_end; insize++)
 232     {
 233       inptr = inptr_before;
 234       res = iconv (cd,
 235                    (ICONV_CONST char **) &inptr, &insize,
 236                    &outptr, &outsize);
 237       if (!(res == (size_t)(-1) && errno == EINVAL))
 238         break;
 239       /* iconv can eat up a shift sequence but give EINVAL while attempting
 240          to convert the first character.  E.g. libiconv does this.  */
 241       if (inptr > inptr_before)
 242         {
 243           res = 0;
 244           break;
 245         }
 246     }
 247 
 248   *inbuf = inptr;
 249   *inbytesleft = inptr_end - inptr;
 250 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
 251   /* Irix iconv() inserts a NUL byte if it cannot convert.
 252      NetBSD iconv() inserts a question mark if it cannot convert.
 253      Only GNU libiconv and GNU libc are known to prefer to fail rather
 254      than doing a lossy conversion.  */
 255   if (res != (size_t)(-1) && res > 0)
 256     {
 257       /* iconv() has already incremented INPTR.  We cannot go back to a
 258          previous INPTR, otherwise the state inside CD would become invalid,
 259          if FROM_CODESET is a stateful encoding.  So, tell the caller that
 260          *INBUF has already been incremented.  */
 261       *incremented = (inptr > inptr_before);
 262       errno = EILSEQ;
 263       return (size_t)(-1);
 264     }
 265 # endif
 266 
 267   if (res != (size_t)(-1))
 268     {
 269       *outbuf = outptr;
 270       *outbytesleft = outsize;
 271     }
 272   *incremented = false;
 273   return res;
 274 }
 275 
 276 /* utf8conv_carefully is like iconv, except that
 277      - it converts from UTF-8 to UTF-8,
 278      - it stops as soon as it encounters a conversion error, and it returns
 279        in *INCREMENTED a boolean telling whether it has incremented the input
 280        pointers past the error location,
 281      - if one_character_only is true, it stops after converting one
 282        character.  */
 283 static size_t
 284 utf8conv_carefully (bool one_character_only,
     /* [previous][next][first][last][top][bottom][index][help] */
 285                     const char **inbuf, size_t *inbytesleft,
 286                     char **outbuf, size_t *outbytesleft,
 287                     bool *incremented)
 288 {
 289   const char *inptr = *inbuf;
 290   size_t insize = *inbytesleft;
 291   char *outptr = *outbuf;
 292   size_t outsize = *outbytesleft;
 293   size_t res;
 294 
 295   res = 0;
 296   do
 297     {
 298       ucs4_t uc;
 299       int n;
 300       int m;
 301 
 302       n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
 303       if (n < 0)
 304         {
 305           errno = (n == -2 ? EINVAL : EILSEQ);
 306           n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
 307           inptr += n;
 308           insize -= n;
 309           res = (size_t)(-1);
 310           *incremented = true;
 311           break;
 312         }
 313       if (outsize == 0)
 314         {
 315           errno = E2BIG;
 316           res = (size_t)(-1);
 317           *incremented = false;
 318           break;
 319         }
 320       m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
 321       if (m == -2)
 322         {
 323           errno = E2BIG;
 324           res = (size_t)(-1);
 325           *incremented = false;
 326           break;
 327         }
 328       inptr += n;
 329       insize -= n;
 330       if (m == -1)
 331         {
 332           errno = EILSEQ;
 333           res = (size_t)(-1);
 334           *incremented = true;
 335           break;
 336         }
 337       outptr += m;
 338       outsize -= m;
 339     }
 340   while (!one_character_only && insize > 0);
 341 
 342   *inbuf = inptr;
 343   *inbytesleft = insize;
 344   *outbuf = outptr;
 345   *outbytesleft = outsize;
 346   return res;
 347 }
 348 
 349 static int
 350 mem_cd_iconveh_internal (const char *src, size_t srclen,
     /* [previous][next][first][last][top][bottom][index][help] */
 351                          iconv_t cd, iconv_t cd1, iconv_t cd2,
 352                          enum iconv_ilseq_handler handler,
 353                          size_t extra_alloc,
 354                          size_t *offsets,
 355                          char **resultp, size_t *lengthp)
 356 {
 357   /* When a conversion error occurs, we cannot start using CD1 and CD2 at
 358      this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
 359      Instead, we have to start afresh from the beginning of SRC.  */
 360   /* Use a temporary buffer, so that for small strings, a single malloc()
 361      call will be sufficient.  */
 362 # define tmpbufsize 4096
 363   /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
 364      libiconv's UCS-4-INTERNAL encoding.  */
 365   union { unsigned int align; char buf[tmpbufsize]; } tmp;
 366 # define tmpbuf tmp.buf
 367 
 368   char *initial_result;
 369   char *result;
 370   size_t allocated;
 371   size_t length;
 372   size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
 373 
 374   if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
 375     {
 376       initial_result = *resultp;
 377       allocated = *lengthp;
 378     }
 379   else
 380     {
 381       initial_result = tmpbuf;
 382       allocated = sizeof (tmpbuf);
 383     }
 384   result = initial_result;
 385 
 386   /* Test whether a direct conversion is possible at all.  */
 387   if (cd == (iconv_t)(-1))
 388     goto indirectly;
 389 
 390   if (offsets != NULL)
 391     {
 392       size_t i;
 393 
 394       for (i = 0; i < srclen; i++)
 395         offsets[i] = (size_t)(-1);
 396 
 397       last_length = (size_t)(-1);
 398     }
 399   length = 0;
 400 
 401   /* First, try a direct conversion, and see whether a conversion error
 402      occurs at all.  */
 403   {
 404     const char *inptr = src;
 405     size_t insize = srclen;
 406 
 407     /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
 408 # if defined _LIBICONV_VERSION \
 409      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 410           || defined __sun)
 411     /* Set to the initial state.  */
 412     iconv (cd, NULL, NULL, NULL, NULL);
 413 # endif
 414 
 415     while (insize > 0)
 416       {
 417         char *outptr = result + length;
 418         size_t outsize = allocated - extra_alloc - length;
 419         bool incremented;
 420         size_t res;
 421         bool grow;
 422 
 423         if (offsets != NULL)
 424           {
 425             if (length != last_length) /* ensure that offset[] be increasing */
 426               {
 427                 offsets[inptr - src] = length;
 428                 last_length = length;
 429               }
 430             res = iconv_carefully_1 (cd,
 431                                      &inptr, &insize,
 432                                      &outptr, &outsize,
 433                                      &incremented);
 434           }
 435         else
 436           /* Use iconv_carefully instead of iconv here, because:
 437              - If TO_CODESET is UTF-8, we can do the error handling in this
 438                loop, no need for a second loop,
 439              - With iconv() implementations other than GNU libiconv and GNU
 440                libc, if we use iconv() in a big swoop, checking for an E2BIG
 441                return, we lose the number of irreversible conversions.  */
 442           res = iconv_carefully (cd,
 443                                  &inptr, &insize,
 444                                  &outptr, &outsize,
 445                                  &incremented);
 446 
 447         length = outptr - result;
 448         grow = (length + extra_alloc > allocated / 2);
 449         if (res == (size_t)(-1))
 450           {
 451             if (errno == E2BIG)
 452               grow = true;
 453             else if (errno == EINVAL)
 454               break;
 455             else if (errno == EILSEQ && handler != iconveh_error)
 456               {
 457                 if (cd2 == (iconv_t)(-1))
 458                   {
 459                     /* TO_CODESET is UTF-8.  */
 460                     /* Error handling can produce up to 1 byte of output.  */
 461                     if (length + 1 + extra_alloc > allocated)
 462                       {
 463                         char *memory;
 464 
 465                         allocated = 2 * allocated;
 466                         if (length + 1 + extra_alloc > allocated)
 467                           abort ();
 468                         if (result == initial_result)
 469                           memory = (char *) malloc (allocated);
 470                         else
 471                           memory = (char *) realloc (result, allocated);
 472                         if (memory == NULL)
 473                           {
 474                             if (result != initial_result)
 475                               free (result);
 476                             errno = ENOMEM;
 477                             return -1;
 478                           }
 479                         if (result == initial_result)
 480                           memcpy (memory, initial_result, length);
 481                         result = memory;
 482                         grow = false;
 483                       }
 484                     /* The input is invalid in FROM_CODESET.  Eat up one byte
 485                        and emit a question mark.  */
 486                     if (!incremented)
 487                       {
 488                         if (insize == 0)
 489                           abort ();
 490                         inptr++;
 491                         insize--;
 492                       }
 493                     result[length] = '?';
 494                     length++;
 495                   }
 496                 else
 497                   goto indirectly;
 498               }
 499             else
 500               {
 501                 if (result != initial_result)
 502                   free (result);
 503                 return -1;
 504               }
 505           }
 506         if (insize == 0)
 507           break;
 508         if (grow)
 509           {
 510             char *memory;
 511 
 512             allocated = 2 * allocated;
 513             if (result == initial_result)
 514               memory = (char *) malloc (allocated);
 515             else
 516               memory = (char *) realloc (result, allocated);
 517             if (memory == NULL)
 518               {
 519                 if (result != initial_result)
 520                   free (result);
 521                 errno = ENOMEM;
 522                 return -1;
 523               }
 524             if (result == initial_result)
 525               memcpy (memory, initial_result, length);
 526             result = memory;
 527           }
 528       }
 529   }
 530 
 531   /* Now get the conversion state back to the initial state.
 532      But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 533 #if defined _LIBICONV_VERSION \
 534     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 535          || defined __sun)
 536   for (;;)
 537     {
 538       char *outptr = result + length;
 539       size_t outsize = allocated - extra_alloc - length;
 540       size_t res;
 541 
 542       res = iconv (cd, NULL, NULL, &outptr, &outsize);
 543       length = outptr - result;
 544       if (res == (size_t)(-1))
 545         {
 546           if (errno == E2BIG)
 547             {
 548               char *memory;
 549 
 550               allocated = 2 * allocated;
 551               if (result == initial_result)
 552                 memory = (char *) malloc (allocated);
 553               else
 554                 memory = (char *) realloc (result, allocated);
 555               if (memory == NULL)
 556                 {
 557                   if (result != initial_result)
 558                     free (result);
 559                   errno = ENOMEM;
 560                   return -1;
 561                 }
 562               if (result == initial_result)
 563                 memcpy (memory, initial_result, length);
 564               result = memory;
 565             }
 566           else
 567             {
 568               if (result != initial_result)
 569                 free (result);
 570               return -1;
 571             }
 572         }
 573       else
 574         break;
 575     }
 576 #endif
 577 
 578   /* The direct conversion succeeded.  */
 579   goto done;
 580 
 581  indirectly:
 582   /* The direct conversion failed.
 583      Use a conversion through UTF-8.  */
 584   if (offsets != NULL)
 585     {
 586       size_t i;
 587 
 588       for (i = 0; i < srclen; i++)
 589         offsets[i] = (size_t)(-1);
 590 
 591       last_length = (size_t)(-1);
 592     }
 593   length = 0;
 594   {
 595     const bool slowly = (offsets != NULL || handler == iconveh_error);
 596 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
 597     char utf8buf[utf8bufsize + 1];
 598     size_t utf8len = 0;
 599     const char *in1ptr = src;
 600     size_t in1size = srclen;
 601     bool do_final_flush1 = true;
 602     bool do_final_flush2 = true;
 603 
 604     /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
 605 # if defined _LIBICONV_VERSION \
 606      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 607           || defined __sun)
 608     /* Set to the initial state.  */
 609     if (cd1 != (iconv_t)(-1))
 610       iconv (cd1, NULL, NULL, NULL, NULL);
 611     if (cd2 != (iconv_t)(-1))
 612       iconv (cd2, NULL, NULL, NULL, NULL);
 613 # endif
 614 
 615     while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
 616       {
 617         char *out1ptr = utf8buf + utf8len;
 618         size_t out1size = utf8bufsize - utf8len;
 619         bool incremented1;
 620         size_t res1;
 621         int errno1;
 622 
 623         /* Conversion step 1: from FROM_CODESET to UTF-8.  */
 624         if (in1size > 0)
 625           {
 626             if (offsets != NULL
 627                 && length != last_length) /* ensure that offset[] be increasing */
 628               {
 629                 offsets[in1ptr - src] = length;
 630                 last_length = length;
 631               }
 632             if (cd1 != (iconv_t)(-1))
 633               {
 634                 if (slowly)
 635                   res1 = iconv_carefully_1 (cd1,
 636                                             &in1ptr, &in1size,
 637                                             &out1ptr, &out1size,
 638                                             &incremented1);
 639                 else
 640                   res1 = iconv_carefully (cd1,
 641                                           &in1ptr, &in1size,
 642                                           &out1ptr, &out1size,
 643                                           &incremented1);
 644               }
 645             else
 646               {
 647                 /* FROM_CODESET is UTF-8.  */
 648                 res1 = utf8conv_carefully (slowly,
 649                                            &in1ptr, &in1size,
 650                                            &out1ptr, &out1size,
 651                                            &incremented1);
 652               }
 653           }
 654         else if (do_final_flush1)
 655           {
 656             /* Now get the conversion state of CD1 back to the initial state.
 657                But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 658 # if defined _LIBICONV_VERSION \
 659      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 660           || defined __sun)
 661             if (cd1 != (iconv_t)(-1))
 662               res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
 663             else
 664 # endif
 665               res1 = 0;
 666             do_final_flush1 = false;
 667             incremented1 = true;
 668           }
 669         else
 670           {
 671             res1 = 0;
 672             incremented1 = true;
 673           }
 674         if (res1 == (size_t)(-1)
 675             && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
 676           {
 677             if (result != initial_result)
 678               free (result);
 679             return -1;
 680           }
 681         if (res1 == (size_t)(-1)
 682             && errno == EILSEQ && handler != iconveh_error)
 683           {
 684             /* The input is invalid in FROM_CODESET.  Eat up one byte and
 685                emit a question mark.  Room for the question mark was allocated
 686                at the end of utf8buf.  */
 687             if (!incremented1)
 688               {
 689                 if (in1size == 0)
 690                   abort ();
 691                 in1ptr++;
 692                 in1size--;
 693               }
 694             *out1ptr++ = '?';
 695             res1 = 0;
 696           }
 697         errno1 = errno;
 698         utf8len = out1ptr - utf8buf;
 699 
 700         if (offsets != NULL
 701             || in1size == 0
 702             || utf8len > utf8bufsize / 2
 703             || (res1 == (size_t)(-1) && errno1 == E2BIG))
 704           {
 705             /* Conversion step 2: from UTF-8 to TO_CODESET.  */
 706             const char *in2ptr = utf8buf;
 707             size_t in2size = utf8len;
 708 
 709             while (in2size > 0
 710                    || (in1size == 0 && !do_final_flush1 && do_final_flush2))
 711               {
 712                 char *out2ptr = result + length;
 713                 size_t out2size = allocated - extra_alloc - length;
 714                 bool incremented2;
 715                 size_t res2;
 716                 bool grow;
 717 
 718                 if (in2size > 0)
 719                   {
 720                     if (cd2 != (iconv_t)(-1))
 721                       res2 = iconv_carefully (cd2,
 722                                               &in2ptr, &in2size,
 723                                               &out2ptr, &out2size,
 724                                               &incremented2);
 725                     else
 726                       /* TO_CODESET is UTF-8.  */
 727                       res2 = utf8conv_carefully (false,
 728                                                  &in2ptr, &in2size,
 729                                                  &out2ptr, &out2size,
 730                                                  &incremented2);
 731                   }
 732                 else /* in1size == 0 && !do_final_flush1
 733                         && in2size == 0 && do_final_flush2 */
 734                   {
 735                     /* Now get the conversion state of CD1 back to the initial
 736                        state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 737 # if defined _LIBICONV_VERSION \
 738      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 739           || defined __sun)
 740                     if (cd2 != (iconv_t)(-1))
 741                       res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
 742                     else
 743 # endif
 744                       res2 = 0;
 745                     do_final_flush2 = false;
 746                     incremented2 = true;
 747                   }
 748 
 749                 length = out2ptr - result;
 750                 grow = (length + extra_alloc > allocated / 2);
 751                 if (res2 == (size_t)(-1))
 752                   {
 753                     if (errno == E2BIG)
 754                       grow = true;
 755                     else if (errno == EINVAL)
 756                       break;
 757                     else if (errno == EILSEQ && handler != iconveh_error)
 758                       {
 759                         /* Error handling can produce up to 10 bytes of ASCII
 760                            output.  But TO_CODESET may be UCS-2, UTF-16 or
 761                            UCS-4, so use CD2 here as well.  */
 762                         char scratchbuf[10];
 763                         size_t scratchlen;
 764                         ucs4_t uc;
 765                         const char *inptr;
 766                         size_t insize;
 767                         size_t res;
 768 
 769                         if (incremented2)
 770                           {
 771                             if (u8_prev (&uc, (const uint8_t *) in2ptr,
 772                                          (const uint8_t *) utf8buf)
 773                                 == NULL)
 774                               abort ();
 775                           }
 776                         else
 777                           {
 778                             int n;
 779                             if (in2size == 0)
 780                               abort ();
 781                             n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
 782                                                   in2size);
 783                             in2ptr += n;
 784                             in2size -= n;
 785                           }
 786 
 787                         if (handler == iconveh_escape_sequence)
 788                           {
 789                             static char hex[16] = "0123456789ABCDEF";
 790                             scratchlen = 0;
 791                             scratchbuf[scratchlen++] = '\\';
 792                             if (uc < 0x10000)
 793                               scratchbuf[scratchlen++] = 'u';
 794                             else
 795                               {
 796                                 scratchbuf[scratchlen++] = 'U';
 797                                 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
 798                                 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
 799                                 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
 800                                 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
 801                               }
 802                             scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
 803                             scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
 804                             scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
 805                             scratchbuf[scratchlen++] = hex[uc & 15];
 806                           }
 807                         else
 808                           {
 809                             scratchbuf[0] = '?';
 810                             scratchlen = 1;
 811                           }
 812 
 813                         inptr = scratchbuf;
 814                         insize = scratchlen;
 815                         if (cd2 != (iconv_t)(-1))
 816                           res = iconv (cd2,
 817                                        (ICONV_CONST char **) &inptr, &insize,
 818                                        &out2ptr, &out2size);
 819                         else
 820                           {
 821                             /* TO_CODESET is UTF-8.  */
 822                             if (out2size >= insize)
 823                               {
 824                                 memcpy (out2ptr, inptr, insize);
 825                                 out2ptr += insize;
 826                                 out2size -= insize;
 827                                 inptr += insize;
 828                                 insize = 0;
 829                                 res = 0;
 830                               }
 831                             else
 832                               {
 833                                 errno = E2BIG;
 834                                 res = (size_t)(-1);
 835                               }
 836                           }
 837                         length = out2ptr - result;
 838                         if (res == (size_t)(-1) && errno == E2BIG)
 839                           {
 840                             char *memory;
 841 
 842                             allocated = 2 * allocated;
 843                             if (length + 1 + extra_alloc > allocated)
 844                               abort ();
 845                             if (result == initial_result)
 846                               memory = (char *) malloc (allocated);
 847                             else
 848                               memory = (char *) realloc (result, allocated);
 849                             if (memory == NULL)
 850                               {
 851                                 if (result != initial_result)
 852                                   free (result);
 853                                 errno = ENOMEM;
 854                                 return -1;
 855                               }
 856                             if (result == initial_result)
 857                               memcpy (memory, initial_result, length);
 858                             result = memory;
 859                             grow = false;
 860 
 861                             out2ptr = result + length;
 862                             out2size = allocated - extra_alloc - length;
 863                             if (cd2 != (iconv_t)(-1))
 864                               res = iconv (cd2,
 865                                            (ICONV_CONST char **) &inptr,
 866                                            &insize,
 867                                            &out2ptr, &out2size);
 868                             else
 869                               {
 870                                 /* TO_CODESET is UTF-8.  */
 871                                 if (!(out2size >= insize))
 872                                   abort ();
 873                                 memcpy (out2ptr, inptr, insize);
 874                                 out2ptr += insize;
 875                                 out2size -= insize;
 876                                 inptr += insize;
 877                                 insize = 0;
 878                                 res = 0;
 879                               }
 880                             length = out2ptr - result;
 881                           }
 882 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
 883                         /* Irix iconv() inserts a NUL byte if it cannot convert.
 884                            NetBSD iconv() inserts a question mark if it cannot
 885                            convert.
 886                            Only GNU libiconv and GNU libc are known to prefer
 887                            to fail rather than doing a lossy conversion.  */
 888                         if (res != (size_t)(-1) && res > 0)
 889                           {
 890                             errno = EILSEQ;
 891                             res = (size_t)(-1);
 892                           }
 893 # endif
 894                         if (res == (size_t)(-1))
 895                           {
 896                             /* Failure converting the ASCII replacement.  */
 897                             if (result != initial_result)
 898                               free (result);
 899                             return -1;
 900                           }
 901                       }
 902                     else
 903                       {
 904                         if (result != initial_result)
 905                           free (result);
 906                         return -1;
 907                       }
 908                   }
 909                 if (!(in2size > 0
 910                       || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
 911                   break;
 912                 if (grow)
 913                   {
 914                     char *memory;
 915 
 916                     allocated = 2 * allocated;
 917                     if (result == initial_result)
 918                       memory = (char *) malloc (allocated);
 919                     else
 920                       memory = (char *) realloc (result, allocated);
 921                     if (memory == NULL)
 922                       {
 923                         if (result != initial_result)
 924                           free (result);
 925                         errno = ENOMEM;
 926                         return -1;
 927                       }
 928                     if (result == initial_result)
 929                       memcpy (memory, initial_result, length);
 930                     result = memory;
 931                   }
 932               }
 933 
 934             /* Move the remaining bytes to the beginning of utf8buf.  */
 935             if (in2size > 0)
 936               memmove (utf8buf, in2ptr, in2size);
 937             utf8len = in2size;
 938           }
 939 
 940         if (res1 == (size_t)(-1))
 941           {
 942             if (errno1 == EINVAL)
 943               in1size = 0;
 944             else if (errno1 == EILSEQ)
 945               {
 946                 if (result != initial_result)
 947                   free (result);
 948                 errno = errno1;
 949                 return -1;
 950               }
 951           }
 952       }
 953 # undef utf8bufsize
 954   }
 955 
 956  done:
 957   /* Now the final memory allocation.  */
 958   if (result == tmpbuf)
 959     {
 960       size_t memsize = length + extra_alloc;
 961 
 962       if (*resultp != NULL && *lengthp >= memsize)
 963         result = *resultp;
 964       else
 965         {
 966           char *memory;
 967 
 968           memory = (char *) malloc (memsize > 0 ? memsize : 1);
 969           if (memory != NULL)
 970             result = memory;
 971           else
 972             {
 973               errno = ENOMEM;
 974               return -1;
 975             }
 976         }
 977       memcpy (result, tmpbuf, length);
 978     }
 979   else if (result != *resultp && length + extra_alloc < allocated)
 980     {
 981       /* Shrink the allocated memory if possible.  */
 982       size_t memsize = length + extra_alloc;
 983       char *memory;
 984 
 985       memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
 986       if (memory != NULL)
 987         result = memory;
 988     }
 989   *resultp = result;
 990   *lengthp = length;
 991   return 0;
 992 # undef tmpbuf
 993 # undef tmpbufsize
 994 }
 995 
 996 int
 997 mem_cd_iconveh (const char *src, size_t srclen,
     /* [previous][next][first][last][top][bottom][index][help] */
 998                 const iconveh_t *cd,
 999                 enum iconv_ilseq_handler handler,
1000                 size_t *offsets,
1001                 char **resultp, size_t *lengthp)
1002 {
1003   return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1004                                   handler, 0, offsets, resultp, lengthp);
1005 }
1006 
1007 char *
1008 str_cd_iconveh (const char *src,
     /* [previous][next][first][last][top][bottom][index][help] */
1009                 const iconveh_t *cd,
1010                 enum iconv_ilseq_handler handler)
1011 {
1012   /* For most encodings, a trailing NUL byte in the input will be converted
1013      to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1014      function is usable for UTF-7, we have to exclude the NUL byte from the
1015      conversion and add it by hand afterwards.  */
1016   char *result = NULL;
1017   size_t length = 0;
1018   int retval = mem_cd_iconveh_internal (src, strlen (src),
1019                                         cd->cd, cd->cd1, cd->cd2, handler, 1,
1020                                         NULL, &result, &length);
1021 
1022   if (retval < 0)
1023     {
1024       free (result);
1025       return NULL;
1026     }
1027 
1028   /* Add the terminating NUL byte.  */
1029   result[length] = '\0';
1030 
1031   return result;
1032 }
1033 
1034 #endif
1035 
1036 int
1037 mem_iconveh (const char *src, size_t srclen,
     /* [previous][next][first][last][top][bottom][index][help] */
1038              const char *from_codeset, const char *to_codeset,
1039              enum iconv_ilseq_handler handler,
1040              size_t *offsets,
1041              char **resultp, size_t *lengthp)
1042 {
1043   if (srclen == 0)
1044     {
1045       /* Nothing to convert.  */
1046       *lengthp = 0;
1047       return 0;
1048     }
1049   else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1050     {
1051       char *result;
1052 
1053       if (*resultp != NULL && *lengthp >= srclen)
1054         result = *resultp;
1055       else
1056         {
1057           result = (char *) malloc (srclen);
1058           if (result == NULL)
1059             {
1060               errno = ENOMEM;
1061               return -1;
1062             }
1063         }
1064       memcpy (result, src, srclen);
1065       *resultp = result;
1066       *lengthp = srclen;
1067       return 0;
1068     }
1069   else
1070     {
1071 #if HAVE_ICONV
1072       iconveh_t cd;
1073       char *result;
1074       size_t length;
1075       int retval;
1076 
1077       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1078         return -1;
1079 
1080       result = *resultp;
1081       length = *lengthp;
1082       retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1083                                &result, &length);
1084 
1085       if (retval < 0)
1086         {
1087           /* Close cd, but preserve the errno from str_cd_iconv.  */
1088           int saved_errno = errno;
1089           iconveh_close (&cd);
1090           errno = saved_errno;
1091         }
1092       else
1093         {
1094           if (iconveh_close (&cd) < 0)
1095             {
1096               if (result != *resultp)
1097                 free (result);
1098               return -1;
1099             }
1100           *resultp = result;
1101           *lengthp = length;
1102         }
1103       return retval;
1104 #else
1105       /* This is a different error code than if iconv_open existed but didn't
1106          support from_codeset and to_codeset, so that the caller can emit
1107          an error message such as
1108            "iconv() is not supported. Installing GNU libiconv and
1109             then reinstalling this package would fix this."  */
1110       errno = ENOSYS;
1111       return -1;
1112 #endif
1113     }
1114 }
1115 
1116 char *
1117 str_iconveh (const char *src,
     /* [previous][next][first][last][top][bottom][index][help] */
1118              const char *from_codeset, const char *to_codeset,
1119              enum iconv_ilseq_handler handler)
1120 {
1121   if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1122     {
1123       char *result = strdup (src);
1124 
1125       if (result == NULL)
1126         errno = ENOMEM;
1127       return result;
1128     }
1129   else
1130     {
1131 #if HAVE_ICONV
1132       iconveh_t cd;
1133       char *result;
1134 
1135       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1136         return NULL;
1137 
1138       result = str_cd_iconveh (src, &cd, handler);
1139 
1140       if (result == NULL)
1141         {
1142           /* Close cd, but preserve the errno from str_cd_iconv.  */
1143           int saved_errno = errno;
1144           iconveh_close (&cd);
1145           errno = saved_errno;
1146         }
1147       else
1148         {
1149           if (iconveh_close (&cd) < 0)
1150             {
1151               free (result);
1152               return NULL;
1153             }
1154         }
1155       return result;
1156 #else
1157       /* This is a different error code than if iconv_open existed but didn't
1158          support from_codeset and to_codeset, so that the caller can emit
1159          an error message such as
1160            "iconv() is not supported. Installing GNU libiconv and
1161             then reinstalling this package would fix this."  */
1162       errno = ENOSYS;
1163       return NULL;
1164 #endif
1165     }
1166 }

/* [previous][next][first][last][top][bottom][index][help] */