re PR libfortran/35863 ([F2003] Implement ENCODING="UTF-8")

2008-08-15 Jerry DeLisle <jvdelisle@gcc.gnu.org> PR libfortran/35863 * intrinsics/selected_char_kind.c: Enable iso_10646. * io/read.c (typedef uchar): New type. (read_utf8): New function to read a single UTF-8 encoded character. (read_utf8_char1): New function to read UTF-8 into a KIND=1 string. (read_default_char1): New functio to read default into KIND=1 string. (read_utf8_char4): New function to read UTF-8 into a KIND=4 string. (read_default_char4): New function to read UTF-8 into a KIND=4 string. (read_a): Modify to use the new functions. (read_a_char4): Modify to use the new functions. * io/write.c (error.h): Add include. (typedef uchar): New type. (write_default_char4): New function to default write KIND=4 string. (write_utf8_char4): New function to UTF-8 write KIND=4 string. (write_a_char4): Modify to use new functions. (write_character): Modify to use new functions. From-SVN: r139147
2008-08-16 03:38:31 +00:00 · 2008-08-16 03:38:31 +00:00 · 3ae86bf4f4
parent dad80a1bff
commit 3ae86bf4f4
4 changed files with 407 additions and 109 deletions
--- a/libgfortran/ChangeLog
+++ b/libgfortran/ChangeLog
@ -1,3 +1,21 @@
 2008-08-15  Jerry DeLisle  <jvdelisle@gcc.gnu.org>
 	PR libfortran/35863
 	* intrinsics/selected_char_kind.c: Enable iso_10646.
 	* io/read.c (typedef uchar): New type.
 	(read_utf8): New function to read a single UTF-8 encoded character.
 	(read_utf8_char1): New function to read UTF-8 into a KIND=1 string.
 	(read_default_char1): New functio to read default into KIND=1 string.
 	(read_utf8_char4): New function to read UTF-8 into a KIND=4 string.
 	(read_default_char4): New function to read UTF-8 into a KIND=4 string.
 	(read_a): Modify to use the new functions.
 	(read_a_char4): Modify to use the new functions.
 	* io/write.c (error.h): Add include. (typedef uchar): New type.
 	(write_default_char4): New function to default write KIND=4 string.
 	(write_utf8_char4): New function to UTF-8 write KIND=4 string.
 	(write_a_char4): Modify to use new functions.
 	(write_character): Modify to use new functions.
 2008-08-14  H.J. Lu  <hongjiu.lu@intel.com>
 	PR libfortran/37123
--- a/libgfortran/intrinsics/selected_char_kind.c
+++ b/libgfortran/intrinsics/selected_char_kind.c
@ -44,6 +44,8 @@ selected_char_kind (gfc_charlen_type name_len, char *name)
  if ((len == 5 && strncasecmp (name, "ascii", 5) == 0)
      || (len == 7 && strncasecmp (name, "default", 7) == 0))
    return 1;
  else if (len == 9 && strncasecmp (name, "iso_10646", 9) == 0)
    return 1;
  else
    return -1;
 }
--- a/libgfortran/io/read.c
+++ b/libgfortran/io/read.c
@ -34,6 +34,8 @@ Boston, MA 02110-1301, USA.  */
 #include <ctype.h>
 #include <stdlib.h>
 typedef unsigned char uchar;
 /* read.c -- Deal with formatted reads */
@ -236,78 +238,239 @@ read_l (st_parameter_dt *dtp, const fnode *f, char *dest, int length)
 }
-/* read_a()-- Read a character record.  This one is pretty easy. */
+static inline gfc_char4_t
 read_utf8 (st_parameter_dt *dtp, size_t *nbytes) 
 {
  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  static uchar buffer[6];
  size_t i, nb, nread;
  gfc_char4_t c;
  int status;
  char *s;
-void
+  *nbytes = 1;
-read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+  s = (char *) &buffer[0];
  status = read_block_form (dtp, s, nbytes);
  if (status == FAILURE)
    return 0;
  /* If this is a short read, just return.  */
  if (*nbytes == 0)
    return 0;
  c = buffer[0];
  if (c < 0x80)
    return c;
  /* The number of leading 1-bits in the first byte indicates how many
     bytes follow.  */
  for (nb = 2; nb < 7; nb++)
    if ((c & ~masks[nb-1]) == patns[nb-1])
      goto found;
  goto invalid;
 found:
  c = (c & masks[nb-1]);
  nread = nb - 1;
  s = (char *) &buffer[1];
  status = read_block_form (dtp, s, &nread);
  if (status == FAILURE)
    return 0;
  /* Decode the bytes read.  */
  for (i = 1; i < nb; i++)
    {
      gfc_char4_t n = *s++;
      if ((n & 0xC0) != 0x80)
 	goto invalid;
      c = ((c << 6) + (n & 0x3F));
    }
  /* Make sure the shortest possible encoding was used.  */
  if (c <=      0x7F && nb > 1) goto invalid;
  if (c <=     0x7FF && nb > 2) goto invalid;
  if (c <=    0xFFFF && nb > 3) goto invalid;
  if (c <=  0x1FFFFF && nb > 4) goto invalid;
  if (c <= 0x3FFFFFF && nb > 5) goto invalid;
  /* Make sure the character is valid.  */
  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
    goto invalid;
  return c;
 invalid:
  generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding");
  return (gfc_char4_t) '?';
 }
 static void
 read_utf8_char1 (st_parameter_dt *dtp, char *p, int len, size_t width)
 {
  gfc_char4_t c;
  char *dest;
  size_t nbytes;
  int i, j;
  len = ((int) width < len) ? len : (int) width;
  dest = (char *) p;
  /* Proceed with decoding one character at a time.  */
  for (j = 0; j < len; j++, dest++)
    {
      c = read_utf8 (dtp, &nbytes);
      /* Check for a short read and if so, break out.  */
      if (nbytes == 0)
 	break;
      *dest = c > 255 ? '?' : (uchar) c;
    }
  /* If there was a short read, pad the remaining characters.  */
  for (i = j; i < len; i++)
    *dest++ = ' ';
  return;
 }
 static void
 read_default_char1 (st_parameter_dt *dtp, char *p, int len, size_t width)
 {
  char *s;
-  int m, n, wi, status;
+  int m, n, status;
  size_t w;
-  wi = f->u.w;
+  s = gfc_alloca (width);
  if (wi == -1) /* '(A)' edit descriptor  */
    wi = length;
-  w = wi;
+  status = read_block_form (dtp, s, &width);
-
+  
  s = gfc_alloca (w);
  dtp->u.p.sf_read_comma = 0;
  status = read_block_form (dtp, s, &w);
  dtp->u.p.sf_read_comma =
    dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
  if (status == FAILURE)
    return;
-  if (w > (size_t) length)
+  if (width > (size_t) len)
-     s += (w - length);
+     s += (width - len);
-  m = ((int) w > length) ? length : (int) w;
+  m = ((int) width > len) ? len : (int) width;
  memcpy (p, s, m);
-  n = length - w;
+  n = len - width;
  if (n > 0)
    memset (p + m, ' ', n);
 }
-void
+
-read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+static void
 read_utf8_char4 (st_parameter_dt *dtp, void *p, int len, size_t width)
 {
  gfc_char4_t *dest;
  size_t nbytes;
  int i, j;
  len = ((int) width < len) ? len : (int) width;
  dest = (gfc_char4_t *) p;
  /* Proceed with decoding one character at a time.  */
  for (j = 0; j < len; j++, dest++)
    {
      *dest = read_utf8 (dtp, &nbytes);
      /* Check for a short read and if so, break out.  */
      if (nbytes == 0)
 	break;
    }
  /* If there was a short read, pad the remaining characters.  */
  for (i = j; i < len; i++)
    *dest++ = (gfc_char4_t) ' ';
  return;
 }
 static void
 read_default_char4 (st_parameter_dt *dtp, char *p, int len, size_t width)
 {
  char *s;
  gfc_char4_t *dest;
-  int m, n, wi, status;
+  int m, n, status;
  size_t w;
-  wi = f->u.w;
+  s = gfc_alloca (width);
  if (wi == -1) /* '(A)' edit descriptor  */
    wi = length;
-  w = wi;
+  status = read_block_form (dtp, s, &width);
  s = gfc_alloca (w);
  /* Read in w bytes, treating comma as not a separator.  */
  dtp->u.p.sf_read_comma = 0;
  status = read_block_form (dtp, s, &w);
  dtp->u.p.sf_read_comma =
    dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
  if (status == FAILURE)
    return;
-  if (w > (size_t) length)
+  if (width > (size_t) len)
-     s += (w - length);
+     s += (width - len);
-  m = ((int) w > length) ? length : (int) w;
+  m = ((int) width > len) ? len : (int) width;
  dest = (gfc_char4_t *) p;
  for (n = 0; n < m; n++, dest++, s++)
    *dest = (unsigned char ) *s;
-  for (n = 0; n < length - (int) w; n++, dest++)
+  for (n = 0; n < len - (int) width; n++, dest++)
    *dest = (unsigned char) ' ';
 }
 /* read_a()-- Read a character record into a KIND=1 character destination,
   processing UTF-8 encoding if necessary.  */
 void
 read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length)
 {
  int wi;
  size_t w;
  wi = f->u.w;
  if (wi == -1) /* '(A)' edit descriptor  */
    wi = length;
  w = wi;
  /* Read in w characters, treating comma as not a separator.  */
  dtp->u.p.sf_read_comma = 0;
  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
    read_utf8_char1 (dtp, p, length, w);
  else
    read_default_char1 (dtp, p, length, w);
  dtp->u.p.sf_read_comma =
    dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
 }
 /* read_a_char4()-- Read a character record into a KIND=4 character destination,
   processing UTF-8 encoding if necessary.  */
 void
 read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
 {
  int wi;
  size_t w;
  wi = f->u.w;
  if (wi == -1) /* '(A)' edit descriptor  */
    wi = length;
  w = wi;
  /* Read in w characters, treating comma as not a separator.  */
  dtp->u.p.sf_read_comma = 0;
  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
    read_utf8_char4 (dtp, p, length, w);
  else
    read_default_char4 (dtp, p, length, w);
  dtp->u.p.sf_read_comma =
    dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
 }
 /* eat_leading_spaces()-- Given a character pointer and a width,
 * ignore the leading spaces.  */
--- a/libgfortran/io/write.c
+++ b/libgfortran/io/write.c
@ -36,10 +36,161 @@ Boston, MA 02110-1301, USA.  */
 #include <ctype.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <errno.h>
 #define star_fill(p, n) memset(p, '*', n)
 #include "write_float.def"
 typedef unsigned char uchar;
 /* Write out default char4.  */
 static void
 write_default_char4 (st_parameter_dt *dtp, gfc_char4_t *source,
 		     int src_len, int w_len)
 {
  char *p;
  int j, k = 0;
  gfc_char4_t c;
  uchar d;
  /* Take care of preceding blanks.  */
  if (w_len > src_len)
    {
      k = w_len - src_len;
      p = write_block (dtp, k);
      if (p == NULL)
 	return;
      memset (p, ' ', k);
    }
  /* Get ready to handle delimiters if needed.  */
  switch (dtp->u.p.delim_status)
    {
    case DELIM_APOSTROPHE:
      d = '\'';
      break;
    case DELIM_QUOTE:
      d = '"';
      break;
    default:
      d = ' ';
      break;
    }
  /* Now process the remaining characters, one at a time.  */
  for (j = k; j < src_len; j++)
    {
      c = source[j];
      /* Handle delimiters if any.  */
      if (c == d && d != ' ')
 	{
 	  p = write_block (dtp, 2);
 	  if (p == NULL)
 	    return;
 	  *p++ = (uchar) c;
 	}
      else
 	{
 	  p = write_block (dtp, 1);
 	  if (p == NULL)
 	    return;
 	}
      *p = c > 255 ? '?' : (uchar) c;
    }
 }
 /* Write out UTF-8 converted from char4.  */
 static void
 write_utf8_char4 (st_parameter_dt *dtp, gfc_char4_t *source,
 		     int src_len, int w_len)
 {
  char *p;
  int j, k = 0;
  gfc_char4_t c;
  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
  size_t nbytes;
  uchar buf[6], d, *q; 
  /* Take care of preceding blanks.  */
  if (w_len > src_len)
    {
      k = w_len - src_len;
      p = write_block (dtp, k);
      if (p == NULL)
 	return;
      memset (p, ' ', k);
    }
  /* Get ready to handle delimiters if needed.  */
  switch (dtp->u.p.delim_status)
    {
    case DELIM_APOSTROPHE:
      d = '\'';
      break;
    case DELIM_QUOTE:
      d = '"';
      break;
    default:
      d = ' ';
      break;
    }
  /* Now process the remaining characters, one at a time.  */
  for (j = k; j < src_len; j++)
    {
      c = source[j];
      if (c < 0x80)
 	{
 	  /* Handle the delimiters if any.  */
 	  if (c == d && d != ' ')
 	    {
 	      p = write_block (dtp, 2);
 	      if (p == NULL)
 		return;
 	      *p++ = (uchar) c;
 	    }
 	  else
 	    {
 	      p = write_block (dtp, 1);
 	      if (p == NULL)
 		return;
 	    }
 	  *p = (uchar) c;
 	}
      else
 	{
 	  /* Convert to UTF-8 sequence.  */
 	  nbytes = 1;
 	  q = &buf[6];
 	  do
 	    {
 	      *--q = ((c & 0x3F) | 0x80);
 	      c >>= 6;
 	      nbytes++;
 	    }
 	  while (c >= 0x3F || (c & limits[nbytes-1]));
 	  *--q = (c | masks[nbytes-1]);
 	  p = write_block (dtp, nbytes);
 	  if (p == NULL)
 	    return;
 	  while (q < &buf[6])
 	    *p++ = *q++;
 	}
    }
 }
 void
 write_a (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
 {
@ -126,17 +277,16 @@ write_a (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
 /* The primary difference between write_a_char4 and write_a is that we have to
-   deal with writing from the first byte of the 4-byte character and take care
+   deal with writing from the first byte of the 4-byte character and pay
-   of endianess.  This currently implements encoding="default" which means we
+   attention to the most significant bytes.  For ENCODING="default" write the
-   write the lowest significant byte. If the 3 most significant bytes are
+   lowest significant byte. If the 3 most significant bytes contain
-   not representable emit a '?'.  TODO: Implement encoding="UTF-8"
+   non-zero values, emit a '?'.  For ENCODING="utf-8", convert the UCS-32 value
-   which will process all 4 bytes and translate to the encoded output.  */
+   to the UTF-8 encoded string before writing out.  */
 void
 write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len)
 {
  int wlen;
  char *p;
  gfc_char4_t *q;
  wlen = f->u.string.length < 0
@ -173,19 +323,15 @@ write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len
 	      /* Write out the previously scanned characters in the string.  */
 	      if (bytes > 0)
 		{
-		  p = write_block (dtp, bytes);
+		  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
-		  if (p == NULL)
+		    write_utf8_char4 (dtp, q, bytes, 0);
-		    return;
+		  else
-		  for (j = 0; j < bytes; j++)
+		    write_default_char4 (dtp, q, bytes, 0);
 		    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
 		  bytes = 0;
 		}
 	      /* Write out the CR_LF sequence.  */ 
-	      p = write_block (dtp, 2);
+	      write_default_char4 (dtp, crlf, 2, 0);
              if (p == NULL)
                return;
 	      memcpy (p, crlf, 2);
 	    }
 	  else
 	    bytes++;
@ -194,32 +340,19 @@ write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len
      /*  Write out any remaining bytes if no LF was found.  */
      if (bytes > 0)
 	{
-	  p = write_block (dtp, bytes);
+	  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
-	  if (p == NULL)
+	    write_utf8_char4 (dtp, q, bytes, 0);
-	    return;
+	  else
-	  for (j = 0; j < bytes; j++)
+	    write_default_char4 (dtp, q, bytes, 0);
 	    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
 	}
    }
  else
    {
 #endif
-      int j;
+      if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
-      p = write_block (dtp, wlen);
+	write_utf8_char4 (dtp, q, len, wlen);
      if (p == NULL)
 	return;
      if (wlen < len)
 	{
 	  for (j = 0; j < wlen; j++)
 	    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
 	}
      else
-	{
+	write_default_char4 (dtp, q, len, wlen);
 	  memset (p, ' ', wlen - len);
 	  for (j = wlen - len; j < wlen; j++)
 	    p[j] = q[j] > 255 ? '?' : (unsigned char) q[j];
 	}
 #ifdef HAVE_CRLF
    }
 #endif
@ -745,8 +878,6 @@ write_character (st_parameter_dt *dtp, const char *source, int kind, int length)
 {
  int i, extra;
  char *p, d;
  gfc_char4_t *q;
  switch (dtp->u.p.delim_status)
    {
@ -769,9 +900,9 @@ write_character (st_parameter_dt *dtp, const char *source, int kind, int length)
 	{
 	  extra = 2;
-	    for (i = 0; i < length; i++)
+	  for (i = 0; i < length; i++)
-	      if (source[i] == d)
+	    if (source[i] == d)
-		extra++;
+	      extra++;
 	}
      p = write_block (dtp, length + extra);
@ -796,40 +927,24 @@ write_character (st_parameter_dt *dtp, const char *source, int kind, int length)
    }
  else
    {
      /* We have to scan the source string looking for delimiters to determine
 	 how large the write block needs to be.  */
      if (d == ' ')
 	extra = 0;
      else
 	{
 	  extra = 2;
 	  q = (gfc_char4_t *) source;
 	  for (i = 0; i < length; i++, q++)
 	    if (*q == (gfc_char4_t) d)
 	      extra++;
 	}
      p = write_block (dtp, length + extra);
      if (p == NULL)
 	return;
      if (d == ' ')
 	{
-	  q = (gfc_char4_t *) source;
+	  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
-	  for (i = 0; i < length; i++, q++)
+	    write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0);
-	    p[i] = *q > 255 ? '?' : (unsigned char) *q;
+	  else
 	    write_default_char4 (dtp, (gfc_char4_t *) source, length, 0);
 	}
      else
 	{
-	  *p++ = d;
+	  p = write_block (dtp, 1);
-	  q = (gfc_char4_t *) source;
+	  *p = d;
-	  for (i = 0; i < length; i++, q++)
+
-	    {
+	  if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
-	      *p++ = *q > 255 ? '?' : (unsigned char) *q;
+	    write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0);
-	      if (*q == (gfc_char4_t) d)
+	  else
-		*p++ = d;
+	    write_default_char4 (dtp, (gfc_char4_t *) source, length, 0);
-	    }
+
 	  p = write_block (dtp, 1);
 	  *p = d;
 	}
    }