Commit 12af7e37 authored by Joey Adams

charset: Added utf8_validate_char (factored out of utf8_validate).

parent 06c4af31
...@@ -22,6 +22,24 @@ ...@@ -22,6 +22,24 @@
*/ */
#include "charset.h" #include "charset.h"
#include <assert.h>
/*
 * Check whether the @length bytes starting at @str are valid UTF-8.
 *
 * The buffer is walked one character at a time with utf8_validate_char().
 * Returns false as soon as that function reports 0 (invalid or clipped
 * character), and true once the whole buffer has been consumed.
 * A zero-length buffer never enters the loop and yields true.
 */
bool utf8_validate(const char *str, size_t length)
{
	const char *cur = str;
	const char *const end = str + length;

	while (cur < end) {
		int n = utf8_validate_char(cur, end);

		if (n == 0)
			return false;
		cur += n;
	}

	/*
	 * utf8_validate_char() is documented to return 0 for a clipped
	 * character, so the walk is expected to stop exactly on end.
	 */
	assert(cur == end);
	return true;
}
/* /*
* This function implements the syntax given in RFC3629, which is * This function implements the syntax given in RFC3629, which is
...@@ -37,68 +55,70 @@ ...@@ -37,68 +55,70 @@
* * The sixty-six Unicode "non-characters" are permitted * * The sixty-six Unicode "non-characters" are permitted
* (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF). * (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
*/ */
bool utf8_validate(const char *str, size_t length) int utf8_validate_char(const char *s, const char *e)
{ {
const unsigned char *s = (const unsigned char*)str; unsigned char c = *s++;
const unsigned char *e = s + length;
while (s < e) { if (c <= 0x7F) { /* 00..7F */
unsigned char c = *s++; return 1;
unsigned char c2; } else if (c <= 0xC1) { /* 80..C1 */
int len_minus_two; /* Disallow overlong 2-byte sequence. */
return 0;
} else if (c <= 0xDF) { /* C2..DF */
/* Make sure the character isn't clipped. */
if (e - s < 1)
return 0;
/* Validate the first byte and determine the sequence length. */ /* Make sure subsequent byte is in the range 0x80..0xBF. */
if (c <= 0x7F) /* 00..7F */ if (((unsigned char)*s++ & 0xC0) != 0x80)
continue; return 0;
else if (c <= 0xC1) /* 80..C1 */
return false;
else if (c <= 0xDF) /* C2..DF */
len_minus_two = 0;
else if (c <= 0xEF) /* E0..EF */
len_minus_two = 1;
else if (c <= 0xF4) /* F0..F4 */
len_minus_two = 2;
else
return false;
return 2;
} else if (c <= 0xEF) { /* E0..EF */
/* Make sure the character isn't clipped. */ /* Make sure the character isn't clipped. */
if (s + len_minus_two >= e) if (e - s < 2)
return false; return 0;
/* Disallow overlong 3-byte sequence. */
if (c == 0xE0 && (unsigned char)*s < 0xA0)
return 0;
/* Disallow U+D800..U+DFFF. */
if (c == 0xED && (unsigned char)*s > 0x9F)
return 0;
/* Make sure subsequent bytes are in the range 0x80..0xBF. */
if (((unsigned char)*s++ & 0xC0) != 0x80)
return 0;
if (((unsigned char)*s++ & 0xC0) != 0x80)
return 0;
c2 = *s; return 3;
} else if (c <= 0xF4) { /* F0..F4 */
/* Make sure the character isn't clipped. */
if (e - s < 3)
return 0;
/* Disallow overlong 4-byte sequence. */
if (c == 0xF0 && (unsigned char)*s < 0x90)
return 0;
/* Disallow codepoints beyond U+10FFFF. */
if (c == 0xF4 && (unsigned char)*s > 0x8F)
return 0;
/* Make sure subsequent bytes are in the range 0x80..0xBF. */ /* Make sure subsequent bytes are in the range 0x80..0xBF. */
do { if (((unsigned char)*s++ & 0xC0) != 0x80)
if ((*s++ & 0xC0) != 0x80) return 0;
return false; if (((unsigned char)*s++ & 0xC0) != 0x80)
} while (len_minus_two--); return 0;
if (((unsigned char)*s++ & 0xC0) != 0x80)
return 0;
/* Handle special cases. */ return 4;
switch (c) { } else { /* F5..FF */
case 0xE0: return 0;
/* Disallow overlong 3-byte sequence. */
if (c2 < 0xA0)
return false;
break;
case 0xED:
/* Disallow U+D800..U+DFFF. */
if (c2 > 0x9F)
return false;
break;
case 0xF0:
/* Disallow overlong 4-byte sequence. */
if (c2 < 0x90)
return false;
break;
case 0xF4:
/* Disallow codepoints beyond U+10FFFF. */
if (c2 > 0x8F)
return false;
break;
}
} }
return true;
} }
int utf8_read_char(const char *s, uchar_t *out) int utf8_read_char(const char *s, uchar_t *out)
......
...@@ -42,6 +42,16 @@ typedef uint32_t uchar_t; ...@@ -42,6 +42,16 @@ typedef uint32_t uchar_t;
*/ */
bool utf8_validate(const char *str, size_t length); bool utf8_validate(const char *str, size_t length);
/*
 * Validate a single UTF-8 character.
 * @s: Beginning of UTF-8 character.
 * @e: End of string (one past its last byte).
 *
 * The first byte at @s is read unconditionally, so the caller must
 * ensure @s < @e.
 *
 * If it's valid, return its length in bytes (1 through 4).
 * If it's invalid or clipped (it would extend past @e), return 0.
 */
int utf8_validate_char(const char *s, const char *e);
/* /*
* Read a single UTF-8 character starting at @s, * Read a single UTF-8 character starting at @s,
* returning the length, in bytes, of the character read. * returning the length, in bytes, of the character read.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment