charset: Added utf8_validate_char (factored out of utf8_validate).

12af7e37 · Joey Adams · 06c4af31 · 12af7e37 · 12af7e37
Commit 12af7e37 authored Jun 14, 2011 by Joey Adams
Hide whitespace changes
Inline Side-by-side

Showing with 82 additions and 52 deletions

ccan/charset/charset.c ccan/charset/charset.c +72 -52

ccan/charset/charset.h ccan/charset/charset.h +10 -0

No files found.
--- a/ccan/charset/charset.c
+++ b/ccan/charset/charset.c
@@ -22,6 +22,24 @@
 */

 #include "charset.h"
+#include <assert.h>
+
+
+bool utf8_validate(const char *str, size_t length) /* true iff every character in the @length bytes at @str validates */
+{
+	const char *s = str;           /* cursor: start of the next character to check */
+	const char *e = str + length;  /* one past the last byte of the input */
+	int len;                       /* byte length of the character just validated */
+	
+	for (; s < e; s += len) {
+		len = utf8_validate_char(s, e);
+		if (len == 0)              /* 0 signals an invalid or clipped character */
+			return false;
+	}
+	assert(s == e); /* utf8_validate_char rejects clipped characters, so the walk cannot step past e */
+	
+	return true; /* also reached when length == 0: the loop body never runs */
+}

 /*
 * This function implements the syntax given in RFC3629, which is
@@ -37,68 +55,70 @@
 *  * The sixty-six Unicode "non-characters" are permitted
 *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
 */
-bool utf8_validate(const char *str, size_t length)
+int utf8_validate_char(const char *s, const char *e)
 {
-	const unsigned char *s = (const unsigned char*)str;
-	const unsigned char *e = s + length;
+	unsigned char c = *s++;
 	
-	while (s < e) {
-		unsigned char c = *s++;
-		unsigned char c2;
-		int len_minus_two;
+	if (c <= 0x7F) {        /* 00..7F */
+		return 1;
+	} else if (c <= 0xC1) { /* 80..C1 */
+		/* Stray continuation byte (80..BF) or overlong 2-byte lead (C0..C1). */
+		return 0;
+	} else if (c <= 0xDF) { /* C2..DF */
+		/* Make sure the character isn't clipped. */
+		if (e - s < 1)
+			return 0;
 		
-		/* Validate the first byte and determine the sequence length. */
-		if (c <= 0x7F)          /* 00..7F */
-			continue;
-		else if (c <= 0xC1)     /* 80..C1 */
-			return false;
-		else if (c <= 0xDF)     /* C2..DF */
-			len_minus_two = 0;
-		else if (c <= 0xEF)     /* E0..EF */
-			len_minus_two = 1;
-		else if (c <= 0xF4)     /* F0..F4 */
-			len_minus_two = 2;
-		else
-			return false;
+		/* Make sure subsequent byte is in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
+		return 2;
+	} else if (c <= 0xEF) { /* E0..EF */
 		/* Make sure the character isn't clipped. */
-		if (s + len_minus_two >= e)
-			return false;
+		if (e - s < 2)
+			return 0;
+		
+		/* Disallow overlong 3-byte sequence. */
+		if (c == 0xE0 && (unsigned char)*s < 0xA0)
+			return 0;
+		
+		/* Disallow U+D800..U+DFFF. */
+		if (c == 0xED && (unsigned char)*s > 0x9F)
+			return 0;
+		
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
-		c2 = *s;
+		return 3;
+	} else if (c <= 0xF4) { /* F0..F4 */
+		/* Make sure the character isn't clipped. */
+		if (e - s < 3)
+			return 0;
+		
+		/* Disallow overlong 4-byte sequence. */
+		if (c == 0xF0 && (unsigned char)*s < 0x90)
+			return 0;
+		
+		/* Disallow codepoints beyond U+10FFFF. */
+		if (c == 0xF4 && (unsigned char)*s > 0x8F)
+			return 0;
 		
 		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
-		do {
-			if ((*s++ & 0xC0) != 0x80)
-				return false;
-		} while (len_minus_two--);
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
-		/* Handle special cases. */
-		switch (c) {
-			case 0xE0:
-				/* Disallow overlong 3-byte sequence. */
-				if (c2 < 0xA0)
-					return false;
-				break;
-			case 0xED:
-				/* Disallow U+D800..U+DFFF. */
-				if (c2 > 0x9F)
-					return false;
-				break;
-			case 0xF0:
-				/* Disallow overlong 4-byte sequence. */
-				if (c2 < 0x90)
-					return false;
-				break;
-			case 0xF4:
-				/* Disallow codepoints beyond U+10FFFF. */
-				if (c2 > 0x8F)
-					return false;
-				break;
-		}
+		return 4;
+	} else {                /* F5..FF */
+		return 0;
 	}
-	
-	return true;
 }

 int utf8_read_char(const char *s, uchar_t *out)

--- a/ccan/charset/charset.h
+++ b/ccan/charset/charset.h
@@ -42,6 +42,16 @@ typedef uint32_t uchar_t;
 */
 bool utf8_validate(const char *str, size_t length);

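+/*
+ * Validate a single UTF-8 character.
+ * @s: Beginning of UTF-8 character.  Must satisfy s < e; the first byte is always read.
+ * @e: End of string (one past the last byte).
+ *
+ * If it's valid, return its length in bytes (1 through 4).
+ * If it's invalid or clipped (cut off by @e), return 0.
+ */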
+int utf8_validate_char(const char *s, const char *e);
+
 /*
 * Read a single UTF-8 character starting at @s,
 * returning the length, in bytes, of the character read.