Commit 5b904a3b authored by Rob Pike's avatar Rob Pike

update to Unicode 5

SVN=126184
parent 0d079a53
......@@ -34,7 +34,7 @@ asin(arg double)double
sign = true;
}
if arg > 1 {
panic "return sys.NaN()";
return sys.NaN();
}
temp = sqrt(1 - x*x);
......@@ -54,7 +54,7 @@ func
acos(arg double)double
{
if(arg > 1 || arg < -1) {
panic "return sys.NaN()";
return sys.NaN();
}
return pio2 - asin(arg);
}
......@@ -40,7 +40,7 @@ exp(arg double) double
return 0.;
}
if arg > maxf {
panic "return sys.Inf(1)"
return sys.Inf(1)
}
x = arg*log2e;
......
......@@ -36,7 +36,7 @@ log(arg double) double
var exp int;
if arg <= 0 {
panic "return sys.NaN()";
return sys.NaN();
}
exp,x = sys.frexp(arg);
......@@ -63,7 +63,7 @@ log10(arg double) double
{
if arg <= 0 {
panic "return sys.NaN()";
return sys.NaN();
}
return log(arg) * ln10o1;
}
......@@ -5,7 +5,25 @@
package main
import math "math"
//import math "math"
//////////////////
import math "asin"
import math "atan"
import math "atan2"
import math "exp"
import math "fabs"
import math "floor"
import math "fmod"
import math "hypot"
import math "log"
import math "pow"
import math "pow10"
import math "sin"
import math "sinh"
import math "sqrt"
import math "tan"
import math "tanh"
const
(
......
......@@ -26,14 +26,14 @@ pow(arg1,arg2 double) double
if arg1 <= 0 {
if(arg1 == 0) {
if arg2 <= 0 {
panic "return sys.NaN()";
return sys.NaN();
}
return 0;
}
temp = floor(arg2);
if temp != arg2 {
panic "return sys.NaN()";
panic sys.NaN();
}
l = long(temp);
......
......@@ -48,7 +48,7 @@ sinh(arg double) double
temp = exp(arg)/2;
case arg > 0.5:
// temp = (exp(arg) - exp(-arg))/2;
temp = (exp(arg) - exp(-arg))/2;
default:
argsq = arg*arg;
......@@ -71,5 +71,5 @@ cosh(arg double) double
if arg > 21 {
return exp(arg)/2;
}
// return (exp(arg) + exp(-arg))/2;
return (exp(arg) + exp(-arg))/2;
}
......@@ -19,11 +19,10 @@ sqrt(arg double) double
var x, temp double;
var exp, i int;
/* BUG: NO isINF
if sys.isInf(arg, 1) {
return arg;
}
*/
if arg <= 0 {
if arg < 0 {
panic "return sys.NaN()"
......
......@@ -62,7 +62,7 @@ tan(arg double) double
if flag {
if(temp == 0) {
panic "return sys.NaN()";
panic sys.NaN();
}
temp = 1/temp;
}
......
This diff is collapsed.
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
enum
{
......@@ -23,27 +24,150 @@ enum
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror
Bad = Runeerror,
};
/*
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
* This is a slower but "safe" version of the old chartorune
* that works on strings that are not necessarily null-terminated.
*
* If you know for sure that your string is null-terminated,
* chartorune will be a bit faster.
*
* It is guaranteed not to access more than "length" bytes
* starting at the incoming pointer. This is to avoid
* possible access violations. If the string appears to be
* well-formed but incomplete (i.e., to get the whole Rune
* we'd need to read past str+length) then we'll set the Rune
* to Bad and return 0.
*
* Note that if we have decoding problems for other
* reasons, we return 1 instead of 0.
*/
int
chartorune(Rune *rune, char *str)
charntorune(Rune *rune, const char *str, int length)
{
int c, c1, c2;
int c, c1, c2, c3;
long l;
/* When we're not allowed to read anything */
if(length <= 0) {
goto badlen;
}
/*
* one character sequence (7-bit value)
* 00000-0007F => T1
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
return 1;
}
// If we can't read more than one character we must stop
if(length <= 1) {
goto badlen;
}
/*
* two character sequence (11-bit value)
* 0080-07FF => T2 Tx
*/
c1 = *(uchar*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
// If we can't read more than two characters we must stop
if(length <= 2) {
goto badlen;
}
/*
* three character sequence (16-bit value)
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(uchar*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
if (length <= 3)
goto badlen;
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
// Support for 5-byte or longer UTF-8 would go here, but
// since we don't have that, we'll just fall through to bad.
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
badlen:
*rune = Bad;
return 0;
}
/*
* This is the older "unsafe" version, which works fine on
* null-terminated strings.
*/
int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
long l;
/*
......@@ -88,6 +212,26 @@ chartorune(Rune *rune, char *str)
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
......@@ -97,9 +241,16 @@ bad:
}
int
runetochar(char *str, Rune *rune)
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
/* Decode one rune from at most length bytes and report the byte count to the caller. */
*consumed = charntorune(rune, str, length);
/*
 * charntorune signals failure by storing Runeerror and returning 0 or 1.
 * A Runeerror accompanied by a count of 3 is therefore not a failure:
 * it is the error rune itself, properly encoded in the input.
 */
return *rune != Runeerror || *consumed == 3;
}
int
runetochar(char *str, const Rune *rune)
{
long c;
/* Runes are signed, so convert to unsigned for range check. */
unsigned long c;
/*
* one character sequence
......@@ -121,57 +272,80 @@ runetochar(char *str, Rune *rune)
return 2;
}
/*
* If the Rune is out of range, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
}
int
runelen(long c)
runelen(Rune rune)
{
Rune rune;
char str[10];
rune = c;
return runetochar(str, &rune);
}
int
runenlen(Rune *r, int nrune)
runenlen(const Rune *r, int nrune)
{
int nb, c;
nb = 0;
while(nrune--) {
c = *r++;
if(c <= Rune1)
if (c <= Rune1)
nb++;
else
if(c <= Rune2)
else if (c <= Rune2)
nb += 2;
else
else if (c <= Rune3)
nb += 3;
else /* assert(c <= Rune4) */
nb += 4;
}
return nb;
}
int
fullrune(char *str, int n)
fullrune(const char *str, int n)
{
int c;
if(n > 0) {
c = *(uchar*)str;
if(c < Tx)
if (n > 0) {
int c = *(uchar*)str;
if (c < Tx)
return 1;
if(n > 1)
if(c < T3 || n > 2)
if (n > 1) {
if (c < T3)
return 1;
if (n > 2) {
if (c < T4 || n > 3)
return 1;
}
}
}
return 0;
}
This diff is collapsed.
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#ifndef _UTFH_
#define _UTFH_ 1
#include <stdint.h>
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
/* Limits and sentinel values shared by the rune and UTF routines. */
enum
{
UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* bytes below this value cannot be part of a multi-byte UTF sequence */
Runeself = 0x80, /* runes below this value are encoded as one byte with the same value */
Runeerror = 0xFFFD, /* rune stored when decoding fails */
Runemax = 0x10FFFF, /* maximum rune value */
};
#ifdef __cplusplus
extern "C" {
#endif
/*
* rune routines
*/
/*
* These routines were written by Rob Pike and Ken Thompson
* and first appeared in Plan 9.
* SEE ALSO
* utf (7)
* tcs (1)
*/
// runetochar copies (encodes) one rune, pointed to by r, to at most
// UTFmax bytes starting at s and returns the number of bytes generated.
int runetochar(char* s, const Rune* r);
// chartorune copies (decodes) at most UTFmax bytes starting at s to
// one rune, pointed to by r, and returns the number of bytes consumed.
// If the input is not exactly in UTF format, chartorune will set *r
// to Runeerror and return 1.
//
// Note: There is no special case for a "null-terminated" string. A
// string whose first byte has the value 0 is the UTF8 encoding of the
// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal
// anywhere else in a UTF sequence.
int chartorune(Rune* r, const char* s);
// charntorune is like chartorune, except that it will access at most
// n bytes of s. If the UTF sequence is incomplete within n bytes,
// charntorune will set *r to Runeerror and return 0. If it is complete
// but not in UTF format, it will set *r to Runeerror and return 1.
//
// Added 2004-09-24 by Wei-Hwa Huang
int charntorune(Rune* r, const char* s, int n);
// isvalidcharntorune(str, n, r, consumed)
// is a convenience function that calls "*consumed = charntorune(r, str, n)"
// and returns an int (logically boolean) indicating whether the first
// n bytes of str was a valid and complete UTF sequence.
int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
// runelen returns the number of bytes required to convert r into UTF.
int runelen(Rune r);
// runenlen returns the number of bytes required to convert the n
// runes pointed to by r into UTF.
int runenlen(const Rune* r, int n);
// fullrune returns 1 if the string s of length n is long enough to be
// decoded by chartorune, and 0 otherwise. This does not guarantee
// that the string contains a legal UTF encoding. This routine is used
// by programs that obtain input one byte at a time and need to know
// when a full rune has arrived.
int fullrune(const char* s, int n);
// The following routines are analogous to the corresponding string
// routines with "utf" substituted for "str", and "rune" substituted
// for "chr".
// utflen returns the number of runes that are represented by the UTF
// string s. (cf. strlen)
int utflen(const char* s);
// utfnlen returns the number of complete runes that are represented
// by the first n bytes of the UTF string s. If the last few bytes of
// the string contain an incompletely coded rune, utfnlen will not
// count them; in this way, it differs from utflen, which includes
// every byte of the string. (cf. strnlen)
int utfnlen(const char* s, long n);
// utfrune returns a pointer to the first occurrence of rune r in the
// UTF string s, or 0 if r does not occur in the string. The NULL
// byte terminating a string is considered to be part of the string s.
// (cf. strchr)
const char* utfrune(const char* s, Rune r);
// utfrrune returns a pointer to the last occurrence of rune r in the
// UTF string s, or 0 if r does not occur in the string. The NULL
// byte terminating a string is considered to be part of the string s.
// (cf. strrchr)
const char* utfrrune(const char* s, Rune r);
// utfutf returns a pointer to the first occurrence of the UTF string
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
// null string, utfutf returns s1. (cf. strstr)
const char* utfutf(const char* s1, const char* s2);
// utfecpy copies UTF sequences until a null sequence has been copied,
// but writes no sequences beyond es1. If any sequences are copied,
// s1 is terminated by a null sequence, and a pointer to that sequence
// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
char* utfecpy(char *s1, char *es1, const char *s2);
// These functions are rune-string analogues of the corresponding
// functions in strcat (3).
//
// These routines first appeared in Plan 9.
// SEE ALSO
// memmove (3)
// rune (3)
// strcat (2)
//
// BUGS: The outcome of overlapping moves varies among implementations.
Rune* runestrcat(Rune* s1, const Rune* s2);
Rune* runestrncat(Rune* s1, const Rune* s2, long n);
const Rune* runestrchr(const Rune* s, Rune c);
int runestrcmp(const Rune* s1, const Rune* s2);
int runestrncmp(const Rune* s1, const Rune* s2, long n);
Rune* runestrcpy(Rune* s1, const Rune* s2);
Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
Rune* runestrdup(const Rune* s);
const Rune* runestrrchr(const Rune* s, Rune c);
long runestrlen(const Rune* s);
const Rune* runestrstr(const Rune* s1, const Rune* s2);
// The following routines test types and modify cases for Unicode
// characters. Unicode defines some characters as letters and
// specifies three cases: upper, lower, and title. Mappings among the
// cases are also defined, although they are not exhaustive: some
// upper case letters have no lower case mapping, and so on. Unicode
// also defines several character properties, a subset of which are
// checked by these routines. These routines are based on Unicode
// version 3.0.0.
//
// NOTE: The routines are implemented in C, so the boolean functions
// (e.g., isupperrune) return 0 for false and 1 for true.
//
//
// toupperrune, tolowerrune, and totitlerune are the Unicode case
// mappings. These routines return the character unchanged if it has
// no defined mapping.
Rune toupperrune(Rune r);
Rune tolowerrune(Rune r);
Rune totitlerune(Rune r);
// isupperrune tests for upper case characters, including Unicode
// upper case letters and targets of the toupper mapping. islowerrune
// and istitlerune are defined analogously.
int isupperrune(Rune r);
int islowerrune(Rune r);
int istitlerune(Rune r);
// isalpharune tests for Unicode letters; this includes ideographs in
// addition to alphabetic characters.
int isalpharune(Rune r);
// isdigitrune tests for digits. Non-digit numbers, such as Roman
// numerals, are not included.
int isdigitrune(Rune r);
// isideographicrune tests for ideographic characters and numbers, as
// defined by the Unicode standard.
int isideographicrune(Rune r);
// isspacerune tests for whitespace characters, including "C" locale
// whitespace, Unicode defined whitespace, and the "zero-width
// non-break space" character.
int isspacerune(Rune r);
// (The comments in this file were copied from the manpage files rune.3,
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
// to conform to Google style. /JRM 11/11/05)
#ifdef __cplusplus
}
#endif
#endif
......@@ -12,36 +12,17 @@
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
/*
* compiler directive on Plan 9
*/
#ifndef USED
#define USED(x) if(x);else
#endif
#define uchar _utfuchar
#define ushort _utfushort
#define uint _utfuint
#define ulong _utfulong
#define vlong _utfvlong
#define uvlong _utfuvlong
/*
* easiest way to make sure these are defined
*/
#define uchar _fmtuchar
#define ushort _fmtushort
#define uint _fmtuint
#define ulong _fmtulong
#define vlong _fmtvlong
#define uvlong _fmtuvlong
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned long long uvlong;
typedef long long vlong;
/*
* nil cannot be ((void*)0) on ANSI C,
* because it is used for function pointers
*/
#undef nil
#define nil 0
#undef nelem
#define nelem ((void*)0)
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
#define nil ((void*)0)
......@@ -7,18 +7,17 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#define _BSD_SOURCE 1 /* memccpy */
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
char*
utfecpy(char *to, char *e, char *from)
utfecpy(char *to, char *e, const char *from)
{
char *end;
......
......@@ -7,17 +7,17 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
int
utflen(char *s)
utflen(const char *s)
{
int c;
long n;
......@@ -34,4 +34,5 @@ utflen(char *s)
s += chartorune(&rune, s);
n++;
}
return 0;
}
......@@ -7,22 +7,22 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
int
utfnlen(char *s, long m)
utfnlen(const char *s, long m)
{
int c;
long n;
Rune rune;
char *es;
const char *es;
es = s + m;
for(n = 0; s < es; n++) {
......
......@@ -7,21 +7,22 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
const
char*
utfrrune(char *s, long c)
utfrrune(const char *s, Rune c)
{
long c1;
Rune r;
char *s1;
const char *s1;
if(c < Runesync) /* not part of utf sequence */
return strrchr(s, c);
......@@ -42,4 +43,5 @@ utfrrune(char *s, long c)
s1 = s;
s += c1;
}
return 0;
}
......@@ -7,17 +7,18 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
const
char*
utfrune(char *s, long c)
utfrune(const char *s, Rune c)
{
long c1;
Rune r;
......@@ -41,4 +42,5 @@ utfrune(char *s, long c)
return s;
s += n;
}
return 0;
}
......@@ -7,24 +7,25 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
/*
* Return pointer to first occurrence of s2 in s1,
* 0 if none
*/
const
char*
utfutf(char *s1, char *s2)
utfutf(const char *s1, const char *s2)
{
char *p;
const char *p;
long f, n1, n2;
Rune r;
......@@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)
return strstr(s1, s2);
n2 = strlen(s2);
for(p=s1; p=utfrune(p, f); p+=n1)
for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)
if(strncmp(p, s2, n2) == 0)
return p;
return 0;
......
......@@ -20,6 +20,7 @@ LIBOFILES=\
runtime.$O\
map.$O\
print.$O\
rune.$O\
string.$O\
sys_file.$O\
......
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
/*
* This code is copied, with slight editing due to type differences,
* from a subset of ../lib9/utf/rune.c
*/
#include "runtime.h"
/*
* Constants describing the UTF-8 byte layout used by the routines below.
*   BitN  - number of payload bits carried by the first byte of an N-byte sequence
*   Bitx  - number of payload bits carried by each continuation byte
*   TN/Tx - tag bits that mark the first byte of an N-byte sequence / a continuation byte
*   RuneN - largest value that fits in an N-byte sequence
*/
enum
{
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111: payload bits of a continuation byte */
Testx = Maskx ^ 0xFF, /* 1100 0000: tag bits of a continuation byte */
Runeerror = 0xFFFD, /* rune stored when decoding fails */
Runeself = 0x80,
Bad = Runeerror, /* local alias used by the decoder */
Runemax = 0x10FFFF, /* maximum rune value */
};
/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * charntorune decodes one UTF-8 sequence from str into *rune and is
 * guaranteed not to read more than "length" bytes starting at str.
 * This is to avoid possible access violations.
 *
 * Return value:
 *	1..4	a well-formed sequence of that many bytes was decoded.
 *	1	the bytes are not valid UTF-8; *rune is set to Bad.
 *	0	the sequence appears well-formed but incomplete (i.e., to
 *		get the whole rune we'd need to read past str+length), or
 *		length <= 0; *rune is set to Bad.
 *
 * Overlong encodings are rejected, and so are four-byte sequences
 * whose value exceeds Runemax, matching the range that runetochar
 * in this file is willing to encode.
 */
int32
charntorune(int32 *rune, byte *str, int32 length)
{
	int32 c, c1, c2, c3;
	int32 l;

	/* When we're not allowed to read anything */
	if(length <= 0)
		goto badlen;

	/*
	 * one character sequence (7-bit value)
	 *	00000-0007F => T1
	 */
	c = *(byte*)str;	/* cast not necessary, but kept for safety */
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	/* If we can't read more than one character we must stop */
	if(length <= 1)
		goto badlen;

	/*
	 * two character sequence (11-bit value)
	 *	0080-07FF => T2 Tx
	 */
	c1 = *(byte*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;
	if(c < T3) {
		/* a lone continuation byte cannot start a sequence */
		if(c < T2)
			goto bad;
		l = ((c << Bitx) | c1) & Rune2;
		/* reject overlong encodings of one-byte values */
		if(l <= Rune1)
			goto bad;
		*rune = l;
		return 2;
	}

	/* If we can't read more than two characters we must stop */
	if(length <= 2)
		goto badlen;

	/*
	 * three character sequence (16-bit value)
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(byte*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		/* reject overlong encodings of two-byte values */
		if(l <= Rune2)
			goto bad;
		*rune = l;
		return 3;
	}

	/* If we can't read more than three characters we must stop */
	if(length <= 3)
		goto badlen;

	/*
	 * four character sequence (21-bit value)
	 *	10000-10FFFF => T4 Tx Tx Tx
	 */
	c3 = *(byte*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	if(c < T5) {
		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
		/*
		 * Reject overlong encodings of three-byte values and
		 * anything beyond the largest legal rune: four bytes can
		 * carry up to 1FFFFF, but runes stop at Runemax.
		 */
		if(l <= Rune3 || l > Runemax)
			goto bad;
		*rune = l;
		return 4;
	}

	/*
	 * Support for 5-byte or longer UTF-8 would go here, but
	 * since we don't have that, we'll just fall through to bad.
	 */

	/*
	 * bad decoding
	 */
bad:
	*rune = Bad;
	return 1;

	/*
	 * ran out of input before the sequence was complete
	 */
badlen:
	*rune = Bad;
	return 0;
}
int32
runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */
{
	uint32 cp;

	/*
	 * Runes are signed; work on an unsigned copy so that a negative
	 * rune shows up as a huge value and fails the range checks.
	 */
	cp = rune;

	/* one byte: 00000-0007F => 00-7F */
	if(cp <= Rune1) {
		str[0] = cp;
		return 1;
	}

	/* two bytes: 0080-07FF => T2 Tx */
	if(cp <= Rune2) {
		str[0] = T2 | (cp >> 1*Bitx);
		str[1] = Tx | (cp & Maskx);
		return 2;
	}

	/*
	 * An out-of-range rune is replaced by the error rune.  The check
	 * is made only now because the error rune needs three bytes and
	 * an out-of-range value could never have taken either of the
	 * shorter paths above.
	 */
	if(cp > Runemax)
		cp = Runeerror;

	/* four bytes: values above FFFF => T4 Tx Tx Tx */
	if(cp > Rune3) {
		str[0] = T4 | (cp >> 3*Bitx);
		str[1] = Tx | ((cp >> 2*Bitx) & Maskx);
		str[2] = Tx | ((cp >> 1*Bitx) & Maskx);
		str[3] = Tx | (cp & Maskx);
		return 4;
	}

	/* three bytes: 0800-FFFF => T3 Tx Tx */
	str[0] = T3 | (cp >> 2*Bitx);
	str[1] = Tx | ((cp >> 1*Bitx) & Maskx);
	str[2] = Tx | (cp & Maskx);
	return 3;
}
......@@ -85,6 +85,8 @@ enum
int32 strcmp(byte*, byte*);
int32 findnull(int8*);
void dump(byte*, int32);
int32 runetochar(byte*, int32);
int32 chartorune(uint32*, byte*);
extern string emptystring;
extern int32 debug;
......
......@@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
FLUSH(&b);
}
/*
* this is the plan9 runetochar
* extended for 36 bits in 7 bytes
* note that it truncates to 32 bits
* through the argument passing.
*
* Encodes c into the bytes starting at str and returns the
* number of bytes written.
*/
static int32
runetochar(byte *str, uint32 c)
{
int32 i, n;
uint32 mask, mark;
/*
* one character in 7 bits
*/
if(c <= 0x07FUL) {
str[0] = c;
return 1;
}
/*
* every new character picks up 5 bits
* one less in the first byte and
* six more in an extension byte
*/
mask = 0x7ffUL; /* largest value that fits with one extension byte */
mark = 0xC0UL; /* tag bits of the first byte when there is one extension byte */
/* grow mask and mark until c fits; n counts the extension bytes needed */
for(n=1;; n++) {
if(c <= mask)
break;
mask = (mask<<5) | 0x1fUL;
mark = (mark>>1) | 0x80UL;
}
/*
* lay down the bytes backwards
* n is the number of extension bytes
* mask is the max codepoint
* mark is the zeroth byte indicator
*/
for(i=n; i>0; i--) {
str[i] = 0x80UL | (c&0x3fUL); /* low six bits of c under the extension-byte tag */
c >>= 6;
}
/* whatever is left of c goes into the first byte together with its tag bits */
str[0] = mark|c;
return n+1;
}
void
sys·intstring(int64 v, string s)
{
......
......@@ -75,5 +75,14 @@ func main() {
`\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
"backslashes 2 (backquote)");
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
// test large runes. perhaps not the most logical place for this test.
var r int32;
r = 0x10ffff; // largest rune value
s = string(r);
assert(s, "\xf4\x8f\xbf\xbf", "largest rune");
r = 0x10ffff + 1;
s = string(r);
assert(s, "\xef\xbf\xbd", "too-large rune");
sys.exit(ecode);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment