root/maint/gnulib/lib/unilbrk/ulc-width-linebreaks.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ulc_width_linebreaks
  2. read_file
  3. main

   1 /* Line breaking of strings.
   2    Copyright (C) 2001-2003, 2006-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4 
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation; either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15 
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21 
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25 
  26 #include <config.h>
  27 
  28 /* Specification.  */
  29 #include "unilbrk.h"
  30 
  31 #include <stdlib.h>
  32 #include <string.h>
  33 
  34 #include "c-ctype.h"
  35 #include "uniconv.h"
  36 #include "unilbrk/ulc-common.h"
  37 
  38 /* Line breaking of a string in an arbitrary encoding.
  39 
  40    We convert the input string to Unicode.
  41 
  42    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
  43    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
  44    \U0000FFFF.  UTF-16 and variants support only characters up to
  45    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
  46    UCS-4 specification leaves doubts about endianness and byte order mark.
  47    glibc currently interprets it as big endian without byte order mark,
  48    but this is not backed by an RFC.  So we use UTF-8. It supports
  49    characters up to \U7FFFFFFF and is unambiguously defined.  */
  50 
  51 int
  52 ulc_width_linebreaks (const char *s, size_t n,
     /* [previous][next][first][last][top][bottom][index][help] */
  53                       int width, int start_column, int at_end_columns,
  54                       const char *o, const char *encoding,
  55                       char *p)
  56 {
  57   if (n > 0)
  58     {
  59       if (is_utf8_encoding (encoding))
  60         return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
  61       else
  62         {
  63           /* Convert the string to UTF-8 and build a translation table
  64              from offsets into s to offsets into the translated string.  */
  65           size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
  66 
  67           if (offsets != NULL)
  68             {
  69               uint8_t *t;
  70               size_t m;
  71 
  72               t = u8_conv_from_encoding (encoding, iconveh_question_mark,
  73                                          s, n, offsets, NULL, &m);
  74               if (t != NULL)
  75                 {
  76                   char *memory =
  77                     (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
  78 
  79                   if (m == 0 || memory != NULL)
  80                     {
  81                       char *q = (char *) memory;
  82                       char *o8 = (o != NULL ? (char *) (q + m) : NULL);
  83                       int res_column;
  84                       size_t i;
  85 
  86                       /* Translate the overrides to the UTF-8 string.  */
  87                       if (o != NULL)
  88                         {
  89                           memset (o8, UC_BREAK_UNDEFINED, m);
  90                           for (i = 0; i < n; i++)
  91                             if (offsets[i] != (size_t)(-1))
  92                               o8[offsets[i]] = o[i];
  93                         }
  94 
  95                       /* Determine the line breaks of the UTF-8 string.  */
  96                       res_column =
  97                         u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
  98 
  99                       /* Translate the result back to the original string.  */
 100                       memset (p, UC_BREAK_PROHIBITED, n);
 101                       for (i = 0; i < n; i++)
 102                         if (offsets[i] != (size_t)(-1))
 103                           p[i] = q[offsets[i]];
 104 
 105                       free (memory);
 106                       free (t);
 107                       free (offsets);
 108                       return res_column;
 109                     }
 110                   free (t);
 111                 }
 112               free (offsets);
 113             }
 114           /* Impossible to convert.  */
 115 #if C_CTYPE_ASCII
 116           if (is_all_ascii (s, n))
 117             {
 118               /* ASCII is a subset of UTF-8.  */
 119               return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
 120             }
 121 #endif
 122           /* We have a non-ASCII string and cannot convert it.
 123              Don't produce line breaks except those already present in the
 124              input string.  All we assume here is that the encoding is
 125              minimally ASCII compatible.  */
 126           {
 127             const char *s_end = s + n;
 128             while (s < s_end)
 129               {
 130                 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
 131                       ? UC_BREAK_MANDATORY
 132                       : UC_BREAK_PROHIBITED);
 133                 s++;
 134                 p++;
 135                 if (o != NULL)
 136                   o++;
 137               }
 138             /* We cannot compute widths in this case.  */
 139           }
 140         }
 141     }
 142   return start_column;
 143 }
 144 
 145 
 146 #ifdef TEST
 147 
 148 #include <stdio.h>
 149 #include <locale.h>
 150 
 151 /* Read the contents of an input stream, and return it, terminated with a NUL
 152    byte. */
 153 char *
 154 read_file (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
 155 {
 156 #define BUFSIZE 4096
 157   char *buf = NULL;
 158   int alloc = 0;
 159   int size = 0;
 160   int count;
 161 
 162   while (! feof (stream))
 163     {
 164       if (size + BUFSIZE > alloc)
 165         {
 166           alloc = alloc + alloc / 2;
 167           if (alloc < size + BUFSIZE)
 168             alloc = size + BUFSIZE;
 169           buf = realloc (buf, alloc);
 170           if (buf == NULL)
 171             {
 172               fprintf (stderr, "out of memory\n");
 173               exit (1);
 174             }
 175         }
 176       count = fread (buf + size, 1, BUFSIZE, stream);
 177       if (count == 0)
 178         {
 179           if (ferror (stream))
 180             {
 181               perror ("fread");
 182               exit (1);
 183             }
 184         }
 185       else
 186         size += count;
 187     }
 188   buf = realloc (buf, size + 1);
 189   if (buf == NULL)
 190     {
 191       fprintf (stderr, "out of memory\n");
 192       exit (1);
 193     }
 194   buf[size] = '\0';
 195   return buf;
 196 #undef BUFSIZE
 197 }
 198 
 199 int
 200 main (int argc, char * argv[])
     /* [previous][next][first][last][top][bottom][index][help] */
 201 {
 202   setlocale (LC_CTYPE, "");
 203   if (argc == 2)
 204     {
 205       /* Insert line breaks for a given width.  */
 206       int width = atoi (argv[1]);
 207       char *input = read_file (stdin);
 208       int length = strlen (input);
 209       char *breaks = malloc (length);
 210       int i;
 211 
 212       ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
 213 
 214       for (i = 0; i < length; i++)
 215         {
 216           switch (breaks[i])
 217             {
 218             case UC_BREAK_POSSIBLE:
 219               putc ('\n', stdout);
 220               break;
 221             case UC_BREAK_MANDATORY:
 222               break;
 223             case UC_BREAK_PROHIBITED:
 224               break;
 225             default:
 226               abort ();
 227             }
 228           putc (input[i], stdout);
 229         }
 230 
 231       free (breaks);
 232 
 233       return 0;
 234     }
 235   else
 236     return 1;
 237 }
 238 
 239 #endif /* TEST */

/* [previous][next][first][last][top][bottom][index][help] */