Commit 12af7e37 authored by Joey Adams

charset: Added utf8_validate_char (factored out of utf8_validate).

parent 06c4af31
......@@ -22,6 +22,24 @@
*/
#include "charset.h"
#include <assert.h>
/*
 * Check that the @length bytes starting at @str form well-formed UTF-8.
 *
 * The buffer is consumed one character at a time via utf8_validate_char();
 * the first invalid or truncated character makes the whole string invalid.
 * Returns true when every character validates (including the empty string).
 */
bool utf8_validate(const char *str, size_t length)
{
	const char *cur = str;
	const char *const end = str + length;

	while (cur < end) {
		/* Byte length of the character at cur, or 0 if it is bad. */
		int n = utf8_validate_char(cur, end);

		if (n == 0)
			return false;
		cur += n;
	}

	/* A validated character never extends past end, so we land on it. */
	assert(cur == end);
	return true;
}
/*
* This function implements the syntax given in RFC3629, which is
......@@ -37,68 +55,70 @@
* * The sixty-six Unicode "non-characters" are permitted
* (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
*/
/*
 * Validate the single UTF-8 character that starts at @s.
 * @s: First byte of the character.
 * @e: End of the buffer (one past the last readable byte).
 *
 * Returns the length of the character in bytes (1..4) when it is
 * well-formed, or 0 when it is invalid, clipped by @e, or when the
 * range [@s, @e) is empty.
 */
int utf8_validate_char(const char *s, const char *e)
{
	/* Work on unsigned bytes so range comparisons behave for >= 0x80. */
	const unsigned char *p = (const unsigned char *)s;
	const unsigned char *end = (const unsigned char *)e;
	unsigned char c;

	/* Nothing to read: never dereference at or past the end. */
	if (p >= end)
		return 0;

	c = *p++;

	if (c <= 0x7F) { /* 00..7F */
		return 1;
	} else if (c <= 0xC1) { /* 80..C1 */
		/* Stray continuation byte or overlong 2-byte sequence. */
		return 0;
	} else if (c <= 0xDF) { /* C2..DF */
		/* Make sure the character isn't clipped. */
		if (end - p < 1)
			return 0;

		/* Make sure subsequent byte is in the range 0x80..0xBF. */
		if ((*p++ & 0xC0) != 0x80)
			return 0;

		return 2;
	} else if (c <= 0xEF) { /* E0..EF */
		/* Make sure the character isn't clipped. */
		if (end - p < 2)
			return 0;

		/* Disallow overlong 3-byte sequence. */
		if (c == 0xE0 && *p < 0xA0)
			return 0;

		/* Disallow U+D800..U+DFFF. */
		if (c == 0xED && *p > 0x9F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
		if ((*p++ & 0xC0) != 0x80)
			return 0;
		if ((*p++ & 0xC0) != 0x80)
			return 0;

		return 3;
	} else if (c <= 0xF4) { /* F0..F4 */
		/* Make sure the character isn't clipped. */
		if (end - p < 3)
			return 0;

		/* Disallow overlong 4-byte sequence. */
		if (c == 0xF0 && *p < 0x90)
			return 0;

		/* Disallow codepoints beyond U+10FFFF. */
		if (c == 0xF4 && *p > 0x8F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
		if ((*p++ & 0xC0) != 0x80)
			return 0;
		if ((*p++ & 0xC0) != 0x80)
			return 0;
		if ((*p++ & 0xC0) != 0x80)
			return 0;

		return 4;
	} else { /* F5..FF */
		return 0;
	}
}
int utf8_read_char(const char *s, uchar_t *out)
......
......@@ -42,6 +42,16 @@ typedef uint32_t uchar_t;
*/
bool utf8_validate(const char *str, size_t length);
/*
 * Validate a single UTF-8 character.
 * @s: Beginning of UTF-8 character.
 * @e: End of string (one past the last byte that may be read).
 *
 * If it's valid, return its length in bytes (1 through 4).
 * If it's invalid or clipped (the sequence would run past @e), return 0.
 */
int utf8_validate_char(const char *s, const char *e);
/*
* Read a single UTF-8 character starting at @s,
* returning the length, in bytes, of the character read.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment