update to Unicode 5

SVN=126184

update to Unicode 5
SVN=126184
5b904a3b · Rob Pike · 0d079a53 · 5b904a3b · 5b904a3b · 5b904a3b
Commit 5b904a3b authored Jul 07, 2008 by Rob Pike
24 changed files
--- a/src/lib/math/asin.go
+++ b/src/lib/math/asin.go
@@ -34,7 +34,7 @@ asin(arg double)double
 		sign = true;
 	}
 	if arg > 1 {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}

 	temp = sqrt(1 - x*x);
@@ -54,7 +54,7 @@ func
 acos(arg double)double
 {
 	if(arg > 1 || arg < -1) {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}
 	return pio2 - asin(arg);
 }
--- a/src/lib/math/exp.go
+++ b/src/lib/math/exp.go
@@ -40,7 +40,7 @@ exp(arg double) double
 		return 0.;
 	}
 	if arg > maxf {
-		panic "return sys.Inf(1)"
+		return sys.Inf(1)
 	}

 	x = arg*log2e;

--- a/src/lib/math/log.go
+++ b/src/lib/math/log.go
@@ -36,7 +36,7 @@ log(arg double) double
 	var exp int;

 	if arg <= 0 {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}

 	exp,x = sys.frexp(arg);
@@ -63,7 +63,7 @@ log10(arg double) double
 {

 	if arg <= 0 {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}
 	return log(arg) * ln10o1;
 }
--- a/src/lib/math/main.go
+++ b/src/lib/math/main.go
@@ -5,7 +5,25 @@

 package main

-import math "math"
+//import math "math"
+//////////////////
+ import math "asin"
+ import math "atan"
+ import math "atan2"
+ import math "exp"
+ import math "fabs"
+ import math "floor"
+ import math "fmod"
+ import math "hypot"
+ import math "log"
+ import math "pow"
+ import math "pow10"
+ import math "sin"
+ import math "sinh"
+ import math "sqrt"
+ import math "tan"
+ import math "tanh"
+

 const
 (

--- a/src/lib/math/pow.go
+++ b/src/lib/math/pow.go
@@ -26,14 +26,14 @@ pow(arg1,arg2 double) double
 	if arg1 <= 0 {
 		if(arg1 == 0) {
 			if arg2 <= 0 {
-				panic "return sys.NaN()";
+				return sys.NaN();
 			}
 			return 0;
 		}

 		temp = floor(arg2);
 		if temp != arg2 {
-			panic "return sys.NaN()";
+			panic sys.NaN();
 		}

 		l = long(temp);

--- a/src/lib/math/sinh.go
+++ b/src/lib/math/sinh.go
@@ -48,7 +48,7 @@ sinh(arg double) double
 		temp = exp(arg)/2;

 	case arg > 0.5:
-//		temp = (exp(arg) - exp(-arg))/2;
+		temp = (exp(arg) - exp(-arg))/2;

 	default:
 		argsq = arg*arg;
@@ -71,5 +71,5 @@ cosh(arg double) double
 	if arg > 21 {
 		return exp(arg)/2;
 	}
-//	return (exp(arg) + exp(-arg))/2;
+	return (exp(arg) + exp(-arg))/2;
 }
--- a/src/lib/math/sqrt.go
+++ b/src/lib/math/sqrt.go
@@ -19,11 +19,10 @@ sqrt(arg double) double
 	var x, temp double;
 	var exp, i int;

-/* BUG: NO isINF
 	if sys.isInf(arg, 1) {
 		return arg;
 	}
-*/
+
 	if arg <= 0 {
 		if arg < 0 {
 			panic "return sys.NaN()"

--- a/src/lib/math/tan.go
+++ b/src/lib/math/tan.go
@@ -62,7 +62,7 @@ tan(arg double) double

 	if flag {
 		if(temp == 0) {
-			panic "return sys.NaN()";
+			panic sys.NaN();
 		}
 		temp = 1/temp;
 	}

--- a/src/lib9/utf/mkrunetype.c
+++ b/src/lib9/utf/mkrunetype.c
--- a/src/lib9/utf/rune.c
+++ b/src/lib9/utf/rune.c
 /*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
+ *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 enum
 {
@@ -23,27 +24,150 @@ enum
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,

 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */

 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,
+                                        /* 0001 1111 1111 1111 1111 1111 */

 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */

-	Bad	= Runeerror
+	Bad	= Runeerror,
 };

+/*
+ * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
+ * This is a slower but "safe" version of the old chartorune
+ * that works on strings that are not necessarily null-terminated.
+ *
+ * If you know for sure that your string is null-terminated,
+ * chartorune will be a bit faster.
+ *
+ * It is guaranteed not to attempt to access "length"
+ * past the incoming pointer.  This is to avoid
+ * possible access violations.  If the string appears to be
+ * well-formed but incomplete (i.e., to get the whole Rune
+ * we'd need to read past str+length) then we'll set the Rune
+ * to Bad and return 0.
+ *
+ * Note that if we have decoding problems for other
+ * reasons, we return 1 instead of 0.
+ */
 int
-chartorune(Rune *rune, char *str)
+charntorune(Rune *rune, const char *str, int length)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
+	long l;
+
+	/* When we're not allowed to read anything */
+	if(length <= 0) {
+		goto badlen;
+	}
+
+	/*
+	 * one character sequence (7-bit value)
+	 *	00000-0007F => T1
+	 */
+	c = *(uchar*)str;
+	if(c < Tx) {
+		*rune = c;
+		return 1;
+	}
+
+	// If we can't read more than one character we must stop
+	if(length <= 1) {
+		goto badlen;
+	}
+
+	/*
+	 * two character sequence (11-bit value)
+	 *	0080-07FF => T2 Tx
+	 */
+	c1 = *(uchar*)(str+1) ^ Tx;
+	if(c1 & Testx)
+		goto bad;
+	if(c < T3) {
+		if(c < T2)
+			goto bad;
+		l = ((c << Bitx) | c1) & Rune2;
+		if(l <= Rune1)
+			goto bad;
+		*rune = l;
+		return 2;
+	}
+
+	// If we can't read more than two characters we must stop
+	if(length <= 2) {
+		goto badlen;
+	}
+
+	/*
+	 * three character sequence (16-bit value)
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	c2 = *(uchar*)(str+2) ^ Tx;
+	if(c2 & Testx)
+		goto bad;
+	if(c < T4) {
+		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+		if(l <= Rune2)
+			goto bad;
+		*rune = l;
+		return 3;
+	}
+
+	if (length <= 3)
+		goto badlen;
+
+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(uchar*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	// Support for 5-byte or longer UTF-8 would go here, but
+	// since we don't have that, we'll just fall through to bad.
+
+	/*
+	 * bad decoding
+	 */
+bad:
+	*rune = Bad;
+	return 1;
+badlen:
+	*rune = Bad;
+	return 0;
+
+}
+
+
+/*
+ * This is the older "unsafe" version, which works fine on
+ * null-terminated strings.
+ */
+int
+chartorune(Rune *rune, const char *str)
+{
+	int c, c1, c2, c3;
 	long l;

 	/*
@@ -88,6 +212,26 @@ chartorune(Rune *rune, char *str)
 		return 3;
 	}

+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(uchar*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	/*
+	 * Support for 5-byte or longer UTF-8 would go here, but
+	 * since we don't have that, we'll just fall through to bad.
+	 */
+
 	/*
 	 * bad decoding
 	 */
@@ -97,9 +241,16 @@ bad:
 }

 int
-runetochar(char *str, Rune *rune)
+isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
+	*consumed = charntorune(rune, str, length);
+	return *rune != Runeerror || *consumed == 3;
+}
+
+int
+runetochar(char *str, const Rune *rune)
 {
-	long c;
+	/* Runes are signed, so convert to unsigned for range check. */
+	unsigned long c;

 	/*
 	 * one character sequence
@@ -121,57 +272,80 @@ runetochar(char *str, Rune *rune)
 		return 2;
 	}

+	/*
+	 * If the Rune is out of range, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
+	if (c <= Rune3) {
 		str[0] = T3 |  (c >> 2*Bitx);
 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
 		str[2] = Tx |  (c & Maskx);
 		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
 }

 int
-runelen(long c)
+runelen(Rune rune)
 {
-	Rune rune;
 	char str[10];

-	rune = c;
 	return runetochar(str, &rune);
 }

 int
-runenlen(Rune *r, int nrune)
+runenlen(const Rune *r, int nrune)
 {
 	int nb, c;

 	nb = 0;
 	while(nrune--) {
 		c = *r++;
-		if(c <= Rune1)
+		if (c <= Rune1)
 			nb++;
-		else
-		if(c <= Rune2)
+		else if (c <= Rune2)
 			nb += 2;
-		else
+		else if (c <= Rune3)
 			nb += 3;
+		else /* assert(c <= Rune4) */
+			nb += 4;
 	}
 	return nb;
 }

 int
-fullrune(char *str, int n)
+fullrune(const char *str, int n)
 {
-	int c;
-
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
+	if (n > 0) {
+		int c = *(uchar*)str;
+		if (c < Tx)
 			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
+		if (n > 1) {
+			if (c < T3)
 				return 1;
+			if (n > 2) {
+				if (c < T4 || n > 3)
+					return 1;
+			}
+		}
 	}
 	return 0;
 }
--- a/src/lib9/utf/runetype.c
+++ b/src/lib9/utf/runetype.c
--- a/src/lib9/utf/utf.h
+++ b/src/lib9/utf/utf.h
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 1998-2002 by Lucent Technologies.
+ *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+
+#ifndef _UTFH_
+#define _UTFH_ 1
+
+#include <stdint.h>
+
+typedef signed int Rune;	/* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+  UTFmax	= 4,		/* maximum bytes per rune */
+  Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
+  Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
+  Runeerror	= 0xFFFD,	/* decoding error in UTF */
+  Runemax	= 0x10FFFF,	/* maximum rune value */
+};
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * rune routines
+ */
+
+/*
+ * These routines were written by Rob Pike and Ken Thompson
+ * and first appeared in Plan 9.
+ * SEE ALSO
+ * utf (7)
+ * tcs (1)
+*/
+
+// runetochar copies (encodes) one rune, pointed to by r, to at most
+// UTFmax bytes starting at s and returns the number of bytes generated.
+
+int runetochar(char* s, const Rune* r);
+
+
+// chartorune copies (decodes) at most UTFmax bytes starting at s to
+// one rune, pointed to by r, and returns the number of bytes consumed.
+// If the input is not exactly in UTF format, chartorune will set *r
+// to Runeerror and return 1.
+//
+// Note: There is no special case for a "null-terminated" string. A
+// string whose first byte has the value 0 is the UTF8 encoding of the
+// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal
+// anywhere else in a UTF sequence.
+
+int chartorune(Rune* r, const char* s);
+
+
+// charntorune is like chartorune, except that it will access at most
+// n bytes of s.  If the UTF sequence is incomplete within n bytes,
+// charntorune will set *r to Runeerror and return 0. If it is complete
+// but not in UTF format, it will set *r to Runeerror and return 1.
+// 
+// Added 2004-09-24 by Wei-Hwa Huang
+
+int charntorune(Rune* r, const char* s, int n);
+
+// isvalidcharntorune(str, n, r, consumed)
+// is a convenience function that calls "*consumed = charntorune(r, str, n)"
+// and returns an int (logically boolean) indicating whether the first
+// n bytes of str was a valid and complete UTF sequence.
+
+int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
+
+// runelen returns the number of bytes required to convert r into UTF.
+
+int runelen(Rune r);
+
+
+// runenlen returns the number of bytes required to convert the n
+// runes pointed to by r into UTF.
+
+int runenlen(const Rune* r, int n);
+
+
+// fullrune returns 1 if the string s of length n is long enough to be
+// decoded by chartorune, and 0 otherwise. This does not guarantee
+// that the string contains a legal UTF encoding. This routine is used
+// by programs that obtain input one byte at a time and need to know
+// when a full rune has arrived.
+
+int fullrune(const char* s, int n);
+
+// The following routines are analogous to the corresponding string
+// routines with "utf" substituted for "str", and "rune" substituted
+// for "chr".
+
+// utflen returns the number of runes that are represented by the UTF
+// string s. (cf. strlen)
+
+int utflen(const char* s);
+
+
+// utfnlen returns the number of complete runes that are represented
+// by the first n bytes of the UTF string s. If the last few bytes of
+// the string contain an incompletely coded rune, utfnlen will not
+// count them; in this way, it differs from utflen, which includes
+// every byte of the string. (cf. strnlen)
+
+int utfnlen(const char* s, long n);
+
+
+// utfrune returns a pointer to the first occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string.  The NULL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strchr)
+
+const char* utfrune(const char* s, Rune r);
+
+
+// utfrrune returns a pointer to the last occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string.  The NULL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strrchr)
+
+const char* utfrrune(const char* s, Rune r);
+
+
+// utfutf returns a pointer to the first occurrence of the UTF string
+// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
+// null string, utfutf returns s1. (cf. strstr)
+
+const char* utfutf(const char* s1, const char* s2);
+
+
+// utfecpy copies UTF sequences until a null sequence has been copied,
+// but writes no sequences beyond es1.  If any sequences are copied,
+// s1 is terminated by a null sequence, and a pointer to that sequence
+// is returned.  Otherwise, the original s1 is returned. (cf. strecpy)
+
+char* utfecpy(char *s1, char *es1, const char *s2);
+
+
+
+// These functions are rune-string analogues of the corresponding
+// functions in strcat (3).
+// 
+// These routines first appeared in Plan 9.
+// SEE ALSO
+// memmove (3)
+// rune (3)
+// strcat (2)
+//
+// BUGS: The outcome of overlapping moves varies among implementations.
+
+Rune* runestrcat(Rune* s1, const Rune* s2);
+Rune* runestrncat(Rune* s1, const Rune* s2, long n);
+
+const Rune* runestrchr(const Rune* s, Rune c);
+
+int runestrcmp(const Rune* s1, const Rune* s2);
+int runestrncmp(const Rune* s1, const Rune* s2, long n);
+
+Rune* runestrcpy(Rune* s1, const Rune* s2);
+Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
+Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
+
+Rune* runestrdup(const Rune* s);
+
+const Rune* runestrrchr(const Rune* s, Rune c);
+long runestrlen(const Rune* s);
+const Rune* runestrstr(const Rune* s1, const Rune* s2);
+
+
+
+// The following routines test types and modify cases for Unicode
+// characters.  Unicode defines some characters as letters and
+// specifies three cases: upper, lower, and title.  Mappings among the
+// cases are also defined, although they are not exhaustive: some
+// upper case letters have no lower case mapping, and so on.  Unicode
+// also defines several character properties, a subset of which are
+// checked by these routines.  These routines are based on Unicode
+// version 3.0.0.
+//
+// NOTE: The routines are implemented in C, so the boolean functions
+// (e.g., isupperrune) return 0 for false and 1 for true.
+//
+//
+// toupperrune, tolowerrune, and totitlerune are the Unicode case
+// mappings. These routines return the character unchanged if it has
+// no defined mapping.
+
+Rune toupperrune(Rune r);
+Rune tolowerrune(Rune r);
+Rune totitlerune(Rune r);
+
+
+// isupperrune tests for upper case characters, including Unicode
+// upper case letters and targets of the toupper mapping. islowerrune
+// and istitlerune are defined analogously. 
+ 
+int isupperrune(Rune r);
+int islowerrune(Rune r);
+int istitlerune(Rune r);
+
+
+// isalpharune tests for Unicode letters; this includes ideographs in
+// addition to alphabetic characters.
+
+int isalpharune(Rune r);
+
+
+// isdigitrune tests for digits. Non-digit numbers, such as Roman
+// numerals, are not included.
+
+int isdigitrune(Rune r);
+
+
+// isideographicrune tests for ideographic characters and numbers, as
+// defined by the Unicode standard.
+
+int isideographicrune(Rune r);
+
+
+// isspacerune tests for whitespace characters, including "C" locale
+// whitespace, Unicode defined whitespace, and the "zero-width
+// non-break space" character.
+
+int isspacerune(Rune r);
+
+
+// (The comments in this file were copied from the manpage files rune.3,
+// isalpharune.3, and runestrcat.3. Some formatting changes were also made
+// to conform to Google style. /JRM 11/11/05)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
--- a/src/lib9/utf/utfdef.h
+++ b/src/lib9/utf/utfdef.h
@@ -12,36 +12,17 @@
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */

-/*
- * compiler directive on Plan 9
- */
-#ifndef USED
-#define USED(x) if(x);else
-#endif
+#define uchar _utfuchar
+#define ushort _utfushort
+#define uint _utfuint
+#define ulong _utfulong
+#define vlong _utfvlong
+#define uvlong _utfuvlong

-/*
- * easiest way to make sure these are defined
- */
-#define uchar	_fmtuchar
-#define ushort	_fmtushort
-#define uint	_fmtuint
-#define ulong	_fmtulong
-#define vlong	_fmtvlong
-#define uvlong	_fmtuvlong
 typedef unsigned char		uchar;
 typedef unsigned short		ushort;
 typedef unsigned int		uint;
 typedef unsigned long		ulong;
-typedef unsigned long long	uvlong;
-typedef long long		vlong;
-
-/*
- * nil cannot be ((void*)0) on ANSI C,
- * because it is used for function pointers
- */
-#undef	nil
-#define	nil	0
-
-#undef	nelem
-#define	nelem	((void*)0)

+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+#define nil ((void*)0)
--- a/src/lib9/utf/utfecpy.c
+++ b/src/lib9/utf/utfecpy.c
@@ -7,18 +7,17 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
-#define _BSD_SOURCE 1	/* memccpy */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 char*
-utfecpy(char *to, char *e, char *from)
+utfecpy(char *to, char *e, const char *from)
 {
 	char *end;


--- a/src/lib9/utf/utflen.c
+++ b/src/lib9/utf/utflen.c
@@ -7,17 +7,17 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 int
-utflen(char *s)
+utflen(const char *s)
 {
 	int c;
 	long n;
@@ -34,4 +34,5 @@ utflen(char *s)
 			s += chartorune(&rune, s);
 		n++;
 	}
+	return 0;
 }
--- a/src/lib9/utf/utfnlen.c
+++ b/src/lib9/utf/utfnlen.c
@@ -7,22 +7,22 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 int
-utfnlen(char *s, long m)
+utfnlen(const char *s, long m)
 {
 	int c;
 	long n;
 	Rune rune;
-	char *es;
+	const char *es;

 	es = s + m;
 	for(n = 0; s < es; n++) {

--- a/src/lib9/utf/utfrrune.c
+++ b/src/lib9/utf/utfrrune.c
@@ -7,21 +7,22 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

+const
 char*
-utfrrune(char *s, long c)
+utfrrune(const char *s, Rune c)
 {
 	long c1;
 	Rune r;
-	char *s1;
+	const char *s1;

 	if(c < Runesync)		/* not part of utf sequence */
 		return strrchr(s, c);
@@ -42,4 +43,5 @@ utfrrune(char *s, long c)
 			s1 = s;
 		s += c1;
 	}
+	return 0;
 }
--- a/src/lib9/utf/utfrune.c
+++ b/src/lib9/utf/utfrune.c
@@ -7,17 +7,18 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

+const
 char*
-utfrune(char *s, long c)
+utfrune(const char *s, Rune c)
 {
 	long c1;
 	Rune r;
@@ -41,4 +42,5 @@ utfrune(char *s, long c)
 			return s;
 		s += n;
 	}
+	return 0;
 }
--- a/src/lib9/utf/utfutf.c
+++ b/src/lib9/utf/utfutf.c
@@ -7,24 +7,25 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"


 /*
 * Return pointer to first occurrence of s2 in s1,
 * 0 if none
 */
+const
 char*
-utfutf(char *s1, char *s2)
+utfutf(const char *s1, const char *s2)
 {
-	char *p;
+	const char *p;
 	long f, n1, n2;
 	Rune r;

@@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)
 		return strstr(s1, s2);

 	n2 = strlen(s2);
-	for(p=s1; p=utfrune(p, f); p+=n1)
+	for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)
 		if(strncmp(p, s2, n2) == 0)
 			return p;
 	return 0;

--- a/src/runtime/Makefile
+++ b/src/runtime/Makefile
@@ -20,6 +20,7 @@ LIBOFILES=\
 	runtime.$O\
 	map.$O\
 	print.$O\
+	rune.$O\
 	string.$O\
 	sys_file.$O\


--- a/src/runtime/rune.c
+++ b/src/runtime/rune.c
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+
+/*
+ * This code is copied, with slight editing due to type differences,
+ * from a subset of ../lib9/utf/rune.c
+ */
+
+#include "runtime.h"
+
+enum
+{
+	Bit1	= 7,
+	Bitx	= 6,
+	Bit2	= 5,
+	Bit3	= 4,
+	Bit4	= 3,
+	Bit5	= 2, 
+
+	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
+	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
+	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
+	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
+	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
+
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,
+                                        /* 0001 1111 1111 1111 1111 1111 */
+
+	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
+	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
+
+	Runeerror	= 0xFFFD,
+	Runeself	= 0x80,
+
+	Bad	= Runeerror,
+	
+	Runemax	= 0x10FFFF,	/* maximum rune value */
+};
+
+/*
+ * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
+ * This is a slower but "safe" version of the old chartorune 
+ * that works on strings that are not necessarily null-terminated.
+ * 
+ * If you know for sure that your string is null-terminated,
+ * chartorune will be a bit faster.
+ *
+ * It is guaranteed not to attempt to access "length"
+ * past the incoming pointer.  This is to avoid
+ * possible access violations.  If the string appears to be
+ * well-formed but incomplete (i.e., to get the whole Rune
+ * we'd need to read past str+length) then we'll set the Rune
+ * to Bad and return 0.
+ *
+ * Note that if we have decoding problems for other
+ * reasons, we return 1 instead of 0.
+ */
+int32
+charntorune(int32 *rune, byte *str, int32 length)
+{
+	int32 c, c1, c2, c3;
+	int32 l;
+
+	/* When we're not allowed to read anything */
+	if(length <= 0) {
+		goto badlen;
+	}
+
+	/*
+	 * one character sequence (7-bit value)
+	 *	00000-0007F => T1
+	 */
+	c = *(byte*)str;  /* cast not necessary, but kept for safety */
+	if(c < Tx) {
+		*rune = c;
+		return 1;
+	}
+
+	// If we can't read more than one character we must stop
+	if(length <= 1) {
+		goto badlen;
+	}
+
+	/*
+	 * two character sequence (11-bit value)
+	 *	0080-07FF => T2 Tx
+	 */
+	c1 = *(byte*)(str+1) ^ Tx;
+	if(c1 & Testx)
+		goto bad;
+	if(c < T3) {
+		if(c < T2)
+			goto bad;
+		l = ((c << Bitx) | c1) & Rune2;
+		if(l <= Rune1)
+			goto bad;
+		*rune = l;
+		return 2;
+	}
+
+	// If we can't read more than two characters we must stop
+	if(length <= 2) {
+		goto badlen;
+	}
+
+	/*
+	 * three character sequence (16-bit value)
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	c2 = *(byte*)(str+2) ^ Tx;
+	if(c2 & Testx)
+		goto bad;
+	if(c < T4) {
+		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+		if(l <= Rune2)
+			goto bad;
+		*rune = l;
+		return 3;
+	}
+
+	if (length <= 3)
+		goto badlen;
+
+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(byte*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	// Support for 5-byte or longer UTF-8 would go here, but
+	// since we don't have that, we'll just fall through to bad.
+
+	/*
+	 * bad decoding
+	 */
+bad:
+	*rune = Bad;
+	return 1;
+badlen:
+	*rune = Bad;
+	return 0;
+
+}
+
+int32
+runetochar(byte *str, int32 rune)  /* note: in original, arg2 was pointer */
+{
+	/* Runes are signed, so convert to unsigned for range check. */
+	uint32 c;
+
+	/*
+	 * one character sequence
+	 *	00000-0007F => 00-7F
+	 */
+	c = rune;
+	if(c <= Rune1) {
+		str[0] = c;
+		return 1;
+	}
+
+	/*
+	 * two character sequence
+	 *	0080-07FF => T2 Tx
+	 */
+	if(c <= Rune2) {
+		str[0] = T2 | (c >> 1*Bitx);
+		str[1] = Tx | (c & Maskx);
+		return 2;
+	}
+
+	/*
+	 * If the Rune is out of range, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+
+	/*
+	 * three character sequence
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
+}
--- a/src/runtime/runtime.h
+++ b/src/runtime/runtime.h
@@ -85,6 +85,8 @@ enum
 int32 strcmp(byte*, byte*);
 int32 findnull(int8*);
 void	dump(byte*, int32);
+int32 runetochar(byte*, int32);
+int32 chartorune(uint32*, byte*);

 extern string	emptystring;
 extern int32 debug;

--- a/src/runtime/string.c
+++ b/src/runtime/string.c
@@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
 	FLUSH(&b);
 }

-/*
- * this is the plan9 runetochar
- * extended for 36 bits in 7 bytes
- * note that it truncates to 32 bits
- * through the argument passing.
- */
-static int32
-runetochar(byte *str, uint32 c)
-{
-	int32 i, n;
-	uint32 mask, mark;
-
-	/*
-	 * one character in 7 bits
-	 */
-	if(c <= 0x07FUL) {
-		str[0] = c;
-		return 1;
-	}
-
-	/*
-	 * every new character picks up 5 bits
-	 * one less in the first byte and
-	 * six more in an extension byte
-	 */
-	mask = 0x7ffUL;
-	mark = 0xC0UL;
-	for(n=1;; n++) {
-		if(c <= mask)
-			break;
-		mask = (mask<<5) | 0x1fUL;
-		mark = (mark>>1) | 0x80UL;
-	}
-
-	/*
-	 * lay down the bytes backwards
-	 * n is the number of extension bytes
-	 * mask is the max codepoint
-	 * mark is the zeroth byte indicator
-	 */
-	for(i=n; i>0; i--) {
-		str[i] = 0x80UL | (c&0x3fUL);
-		c >>= 6;
-	}
-
-	str[0] = mark|c;
-	return n+1;
-}
-
 void
 sys·intstring(int64 v, string s)
 {

--- a/test/string_lit.go
+++ b/test/string_lit.go
@@ -75,5 +75,14 @@ func main() {
 	       `\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
           "backslashes 2 (backquote)");
 	assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
+
+	// test large runes. perhaps not the most logical place for this test.
+	var r int32;
+	r = 0x10ffff;	// largest rune value
+	s = string(r);
+	assert(s, "\xf4\x8f\xbf\xbf", "largest rune");
+	r = 0x10ffff + 1;
+	s = string(r);
+	assert(s, "\xef\xbf\xbd", "too-large rune");
 	sys.exit(ecode);
 }