Commit 06c4af31 authored by Joey Adams

charset: Rewrote utf8_validate, and added four new functions:

 * utf8_read_char
 * utf8_write_char
 * from_surrogate_pair
 * to_surrogate_pair
parent 23319007
......@@ -5,40 +5,151 @@
/**
* charset - character set conversion and validation routines
*
* This module provides a collection (well, only one, at the moment) of
* well-tested routines for dealing with character set nonsense.
*
* Validation functions:
* - bool utf8_validate(const char *str, size_t length);
* This module provides a collection of well-tested routines
* for dealing with character set nonsense.
*
* Example:
* #include <err.h>
* #include <stdio.h>
* #include <stdlib.h>
* #include <string.h>
* #include <ccan/charset/charset.h>
* #include <ccan/grab_file/grab_file.h>
* #include <ccan/talloc/talloc.h> // For talloc_free()
*
* int main(int argc, char *argv[])
* #include <ccan/talloc/talloc.h>
*
* static void print_json_string(const char *s);
* static bool parse_hex16(const char **sp, unsigned int *out);
*
* // Take a JSON-encoded string on input and print its literal value.
* int main(void)
* {
* size_t len;
* char *file;
* bool valid;
*
* if (argc != 2)
* err(1, "Expected exactly one argument");
*
* file = grab_file(NULL, argv[1], &len);
* if (!file)
* err(1, "Could not read file %s", argv[1]);
*
* valid = utf8_validate(file, len);
* printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
*
* talloc_free(file);
*
* char *input;
* size_t length;
*
* input = grab_file(NULL, NULL, &length);
* if (!input)
* err(1, "Error reading input");
* if (!utf8_validate(input, length)) {
* fprintf(stderr, "Input contains invalid UTF-8\n");
* return 1;
* }
* if (strlen(input) != length) {
* fprintf(stderr, "Input contains null characters\n");
* return 1;
* }
*
* print_json_string(input);
*
* talloc_free(input);
* return 0;
* }
*
* static void print_json_string(const char *s)
* {
* char output_buffer[4];
*
* // Skip leading whitespace
* while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
* s++;
*
* if (*s++ != '"') {
* fprintf(stderr, "Expected JSON string literal surrounded by double quotes.\n");
* exit(EXIT_FAILURE);
* }
*
* while (*s != '"') {
* unsigned char c = *s++;
* char *b = output_buffer;
*
* if (c == '\\') {
* c = *s++;
* switch (c) {
* case '"':
* case '\\':
* case '/':
* *b++ = c;
* break;
* case 'b': *b++ = '\b'; break;
* case 'f': *b++ = '\f'; break;
* case 'n': *b++ = '\n'; break;
* case 'r': *b++ = '\r'; break;
* case 't': *b++ = '\t'; break;
* case 'u': {
* unsigned int uc, lc;
*
* if (!parse_hex16(&s, &uc))
* goto syntax_error;
*
* if (uc >= 0xD800 && uc <= 0xDFFF) {
* // Handle UTF-16 surrogate pair (e.g. "\uD834\uDD1E").
* uchar_t unicode;
*
* if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc))
* goto syntax_error;
*
* unicode = from_surrogate_pair(uc, lc);
* if (unicode == REPLACEMENT_CHARACTER) {
* fprintf(stderr, "Invalid surrogate pair.\n");
* exit(EXIT_FAILURE);
* }
*
* b += utf8_write_char(unicode, b);
* } else {
* // Handle ordinary Unicode escape (e.g. "\u266B").
* b += utf8_write_char(uc, b);
* }
*
* break;
* }
* default:
* goto syntax_error;
* }
* } else if (c <= 0x1F) {
* // Control characters are not allowed in string literals.
* goto syntax_error;
* } else {
* *b++ = c;
* }
*
* fwrite(output_buffer, 1, b - output_buffer, stdout);
* }
*
* putchar('\n');
* return;
*
* syntax_error:
* fprintf(stderr, "Syntax error in JSON string literal.\n");
* exit(EXIT_FAILURE);
* }
*
* static bool parse_hex16(const char **sp, unsigned int *out)
* {
* const char *s = *sp;
* unsigned int ret = 0;
* unsigned int i;
* unsigned int tmp;
* char c;
*
* for (i = 0; i < 4; i++)
* {
* c = *s++;
* if (c >= '0' && c <= '9')
* tmp = c - '0';
* else if (c >= 'A' && c <= 'F')
* tmp = c - 'A' + 10;
* else if (c >= 'a' && c <= 'f')
* tmp = c - 'a' + 10;
* else
* return false;
*
* ret <<= 4;
* ret += tmp;
* }
*
* *out = ret;
* *sp = s;
* return true;
* }
*
* Author: Joey Adams
* License: MIT
......
......@@ -23,8 +23,20 @@
#include "charset.h"
bool utf8_allow_surrogates = false;
/*
 * Check whether the @length bytes starting at @str are well-formed UTF-8.
 *
 * This function implements the syntax given in RFC3629, which is
 * the same as that given in The Unicode Standard, Version 6.0.
 *
 * It has the following properties:
 *
 *  * All codepoints U+0000..U+10FFFF may be encoded,
 *    except for U+D800..U+DFFF, which are reserved
 *    for UTF-16 surrogate pair encoding.
 *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
 *    as they exceed the range of Unicode.
 *  * The sixty-six Unicode "non-characters" are permitted
 *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
 *  * Embedded '\0' bytes are ordinary ASCII and do not end the scan;
 *    only @length bounds the input.
 *
 * NOTE: surrogates are rejected unconditionally; this implementation
 * does not consult utf8_allow_surrogates.
 *
 * Returns true if the whole buffer is valid (an empty buffer is valid),
 * false at the first malformed, overlong, out-of-range or truncated sequence.
 */
bool utf8_validate(const char *str, size_t length)
{
    const unsigned char *s = (const unsigned char *)str;
    const unsigned char *e = s + length;

    while (s < e) {
        unsigned char c = *s++;
        unsigned char c2;
        int len_minus_two; /* number of bytes in sequence - 2 */

        /* Validate the first byte and determine the sequence length. */
        if (c <= 0x7F)          /* 00..7F */
            continue;
        else if (c <= 0xC1)     /* 80..C1: stray continuation or overlong lead */
            return false;
        else if (c <= 0xDF)     /* C2..DF */
            len_minus_two = 0;
        else if (c <= 0xEF)     /* E0..EF */
            len_minus_two = 1;
        else if (c <= 0xF4)     /* F0..F4 */
            len_minus_two = 2;
        else                    /* F5..FF can never appear in UTF-8 */
            return false;

        /*
         * Make sure the character isn't clipped.  Compare against the
         * number of bytes left instead of computing s + len_minus_two,
         * which could point beyond one-past-the-end of the buffer.
         */
        if (e - s <= len_minus_two)
            return false;

        /* Remember the second byte; the special cases below depend on it. */
        c2 = *s;

        /* Make sure subsequent bytes are in the range 0x80..0xBF. */
        do {
            if ((*s++ & 0xC0) != 0x80)
                return false;
        } while (len_minus_two--);

        /* Handle special cases. */
        switch (c) {
        case 0xE0:
            /* Disallow overlong 3-byte sequence. */
            if (c2 < 0xA0)
                return false;
            break;
        case 0xED:
            /* Disallow U+D800..U+DFFF. */
            if (c2 > 0x9F)
                return false;
            break;
        case 0xF0:
            /* Disallow overlong 4-byte sequence. */
            if (c2 < 0x90)
                return false;
            break;
        case 0xF4:
            /* Disallow codepoints beyond U+10FFFF. */
            if (c2 > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    return true;
}
/*
Note to future contributors: These routines are currently all under the
MIT license. It would be nice to keep it that way :)
*/
/*
 * Decode the UTF-8 sequence beginning at @s into *@out and return the
 * number of bytes it occupies (1 thru 4).
 *
 * No validation is performed: the lead byte alone selects the length,
 * and the low six bits of each following byte are taken as payload.
 */
int utf8_read_char(const char *s, uchar_t *out)
{
    const unsigned char *bytes = (const unsigned char *)s;
    const uchar_t lead = bytes[0];
    uchar_t codepoint;
    int length;
    int i;

    /* 00..7F: plain ASCII */
    if (lead < 0x80) {
        *out = lead;
        return 1;
    }

    if (lead < 0xE0) {
        /* C2..DF (unless input is invalid): 5 payload bits in the lead */
        length = 2;
        codepoint = lead & 0x1F;
    } else if (lead < 0xF0) {
        /* E0..EF: 4 payload bits in the lead */
        length = 3;
        codepoint = lead & 0x0F;
    } else {
        /* F0..F4 (unless input is invalid): 3 payload bits in the lead */
        length = 4;
        codepoint = lead & 0x07;
    }

    /* Append six payload bits per trailing byte. */
    for (i = 1; i < length; i++)
        codepoint = (codepoint << 6) | ((uchar_t)bytes[i] & 0x3F);

    *out = codepoint;
    return length;
}
/*
 * Encode @unicode as UTF-8 into @out and return the byte count (1 thru 4).
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF are not encodable;
 * REPLACEMENT_CHARACTER is written in their place.
 */
int utf8_write_char(uchar_t unicode, char *out)
{
    unsigned char *dst = (unsigned char *)out;

    /* Substitute anything that is not a Unicode scalar value up front. */
    if ((unicode >= 0xD800 && unicode <= 0xDFFF) || unicode > 0x10FFFF)
        unicode = REPLACEMENT_CHARACTER;

    if (unicode < 0x80) {
        /* U+0000..U+007F */
        dst[0] = unicode;
        return 1;
    }

    if (unicode < 0x800) {
        /* U+0080..U+07FF */
        dst[0] = 0xC0 | (unicode >> 6);
        dst[1] = 0x80 | (unicode & 0x3F);
        return 2;
    }

    if (unicode < 0x10000) {
        /* U+0800..U+FFFF (including the replacement character) */
        dst[0] = 0xE0 | (unicode >> 12);
        dst[1] = 0x80 | ((unicode >> 6) & 0x3F);
        dst[2] = 0x80 | (unicode & 0x3F);
        return 3;
    }

    /* U+10000..U+10FFFF */
    dst[0] = 0xF0 | (unicode >> 18);
    dst[1] = 0x80 | ((unicode >> 12) & 0x3F);
    dst[2] = 0x80 | ((unicode >> 6) & 0x3F);
    dst[3] = 0x80 | (unicode & 0x3F);
    return 4;
}
uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
{
if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF)
return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
else
return REPLACEMENT_CHARACTER;
}
/*
 * Split the code point @unicode (U+10000..U+10FFFF) into a UTF-16
 * surrogate pair, storing the high half in *@uc and the low half in *@lc.
 *
 * Returns true on success.  For any other input, both outputs are set
 * to REPLACEMENT_CHARACTER and false is returned.
 */
bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
{
    uchar_t offset;

    if (unicode < 0x10000 || unicode > 0x10FFFF) {
        *lc = REPLACEMENT_CHARACTER;
        *uc = REPLACEMENT_CHARACTER;
        return false;
    }

    /* Twenty bits remain after removing the BMP offset: ten per half. */
    offset = unicode - 0x10000;
    *uc = 0xD800 | ((offset >> 10) & 0x3FF);
    *lc = 0xDC00 | (offset & 0x3FF);
    return true;
}
......@@ -26,19 +26,57 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#define REPLACEMENT_CHARACTER 0xFFFD
/*
* Validate the given UTF-8 string. If it contains '\0' characters,
* it is still valid.
*
* By default, Unicode characters U+D800 thru U+DFFF will be considered
* invalid UTF-8. However, if you set utf8_allow_surrogates to true,
* they will be allowed. Allowing the surrogate range makes it possible
* to losslessly encode malformed UTF-16.
* Type for Unicode codepoints.
* We need our own because wchar_t might be 16 bits.
*/
typedef uint32_t uchar_t;
/*
* Validate the given UTF-8 string.
* If it contains '\0' characters, it is still valid.
*/
bool utf8_validate(const char *str, size_t length);
/* Default: false */
extern bool utf8_allow_surrogates;
/*
* Read a single UTF-8 character starting at @s,
* returning the length, in bytes, of the character read.
*
* This function assumes input is valid UTF-8,
* and that there are enough characters in front of @s.
*/
int utf8_read_char(const char *s, uchar_t *out);
/*
* Write a single UTF-8 character to @out,
* returning the length, in bytes, of the character written.
*
* @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
* If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
*
* This function will write up to 4 bytes to @out.
*/
int utf8_write_char(uchar_t unicode, char *out);
/*
* Compute the Unicode codepoint of a UTF-16 surrogate pair.
*
* @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
* If they aren't, this function returns REPLACEMENT_CHARACTER.
*/
uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
/*
* Construct a UTF-16 surrogate pair given a Unicode codepoint.
*
* @unicode should be U+10000..U+10FFFF.
* If it's not, this function returns false,
* and sets *uc and *lc to REPLACEMENT_CHARACTER.
*/
bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
#endif
#include <stdint.h>
#include <stdlib.h>
/*
 * Return the next pseudorandom 32-bit value (0 to 2^32-1).
 * Uses the BCPL linear congruential generator method.
 *
 * The generator is self-contained and starts from a fixed state of 0,
 * so every test run sees the same sequence; this is used instead of the
 * system RNG to ensure tests are consistent.
 */
static uint32_t rand32(void)
{
#if 0
    /*
     * Tests should be run with a different random function
     * from time to time.  I've found that the method below
     * sometimes behaves poorly for testing purposes.
     * For example, rand32() % N might only return even numbers.
     */
    assert(RAND_MAX == 2147483647);
    return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF);
#else
    static uint32_t state = 0;

    /* state = state * a + c, wrapping modulo 2^32 */
    state = state * (uint32_t)0x7FF8A3ED + (uint32_t)0x2AA01D31;
    return state;
#endif
}
#include <ccan/charset/charset.c>
#include <ccan/tap/tap.h>
#include <string.h>
#include "common.h"
/*
* Testing procedure for from_surrogate_pair and to_surrogate_pair:
*
* * For each Unicode code point from 0x10000 to 0x10FFFF:
* - Call to_surrogate_pair, and make sure that:
* - It returns true.
* - uc is 0xD800..0xDBFF
* - lc is 0xDC00..0xDFFF
* - Call from_surrogate_pair on the pair, and make sure that
* it returns the original character.
* * For various invalid arguments to to_surrogate_pair
* (U+0000..U+FFFF and U+110000...):
* - Call to_surrogate_pair, and make sure it:
* - Returns false.
* - Sets *uc and *lc to REPLACEMENT_CHARACTER.
* * For various invalid arguments to from_surrogate_pair
* (uc: not 0xD800..0xDBFF, lc: not 0xDC00..0xDFFF):
* - Call from_surrogate_pair, and make sure
* it returns REPLACEMENT_CHARACTER.
*/
#define INVALID_TRIAL_COUNT 10000
#define range(r, lo, hi) ((r) % ((hi)-(lo)+1) + (lo))
/*
 * Round-trip every code point U+10000..U+10FFFF through
 * to_surrogate_pair and from_surrogate_pair.
 *
 * Reports exactly one test result: a failure at the first code point
 * that misbehaves, or a pass once the whole range has been checked.
 */
static void test_valid(void)
{
    uchar_t codepoint = 0x10000;

    while (codepoint <= 0x10FFFF) {
        unsigned int high, low;

        if (!to_surrogate_pair(codepoint, &high, &low)) {
            fail("to_surrogate_pair did not return true on valid input.");
            return;
        }

        if (high < 0xD800 || high > 0xDBFF) {
            fail("to_surrogate_pair: uc is out of range");
            return;
        }

        if (low < 0xDC00 || low > 0xDFFF) {
            fail("to_surrogate_pair: lc is out of range");
            return;
        }

        if (from_surrogate_pair(high, low) != codepoint) {
            fail("Surrogate pair conversion did not preserve original value (U+%04lX).", (unsigned long)codepoint);
            return;
        }

        codepoint++;
    }

    pass("to_surrogate_pair and from_surrogate_pair work for all valid arguments.");
}
/*
 * Feed to_surrogate_pair INVALID_TRIAL_COUNT code points that have no
 * surrogate-pair form (U+0000..U+FFFF or anything from U+110000 up) and
 * check that it returns false and stores REPLACEMENT_CHARACTER in both
 * outputs.
 *
 * Reports exactly one test result.
 */
static void test_invalid_to_surrogate_pair(void)
{
    long i;
    uchar_t unicode;
    unsigned int uc, lc;

    for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
        /*
         * Pick the input class from the TOP bit of rand32().  The low bit
         * of this generator simply alternates between calls, and the
         * first branch consumes exactly two values per trial, so testing
         * "% 2" would select the same branch on every trial and the
         * "too large" case would never be exercised.
         */
        if (rand32() >> 31) {
            unicode = range(rand32(), 0x0, 0xFFFF);
        } else {
            do {
                unicode = rand32();
            } while (unicode < 0x110000);
        }

        /* Clear the outputs so a stale value from the previous trial
           cannot mask a call that fails to store anything. */
        uc = 0;
        lc = 0;

        if (to_surrogate_pair(unicode, &uc, &lc) != false) {
            fail("to_surrogate_pair did not return false on invalid input.");
            return;
        }

        if (uc != REPLACEMENT_CHARACTER || lc != REPLACEMENT_CHARACTER) {
            fail("to_surrogate_pair did not set uc and lc to the replacement character on invalid input.");
            return;
        }
    }

    pass("to_surrogate_pair seems to handle invalid argument values properly.");
}
/*
 * Feed from_surrogate_pair INVALID_TRIAL_COUNT argument pairs in which
 * at least one half is outside its surrogate range, and check that it
 * returns REPLACEMENT_CHARACTER every time.
 *
 * Each trial randomly makes only @uc invalid, only @lc invalid, or both,
 * so that an implementation which rejects a pair only when BOTH halves
 * are bad is caught as well.
 *
 * Reports exactly one test result.
 */
static void test_invalid_from_surrogate_pair(void)
{
    long i;

    for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
        unsigned int uc, lc;
        /* 0: only uc invalid, 1: only lc invalid, 2: both invalid */
        unsigned int which = rand32() % 3;

        if (which == 1) {
            /* Valid high surrogate. */
            uc = range(rand32(), 0xD800, 0xDBFF);
        } else {
            /* Anything in 0..0xFFFF except a high surrogate. */
            switch (rand32() % 3) {
            case 0:
                uc = range(rand32(), 0x0, 0xD7FF);
                break;
            case 1:
                uc = range(rand32(), 0xDC00, 0xDFFF);
                break;
            default:
                uc = range(rand32(), 0xE000, 0xFFFF);
                break;
            }
        }

        if (which == 0) {
            /* Valid low surrogate. */
            lc = range(rand32(), 0xDC00, 0xDFFF);
        } else {
            /* Anything in 0..0xFFFF except a low surrogate. */
            switch (rand32() % 3) {
            case 0:
                lc = range(rand32(), 0x0, 0xD7FF);
                break;
            case 1:
                lc = range(rand32(), 0xD800, 0xDBFF);
                break;
            default:
                lc = range(rand32(), 0xE000, 0xFFFF);
                break;
            }
        }

        if (from_surrogate_pair(uc, lc) != REPLACEMENT_CHARACTER) {
            fail("from_surrogate_pair(0x%04X, 0x%04X) did not return the replacement character", uc, lc);
            return;
        }
    }

    pass("from_surrogate_pair seems to handle invalid arguments properly.");
}
/*
 * Test driver: each of the three test functions reports one result.
 */
int main(void)
{
    int status;

    plan_tests(3);

    test_valid();
    test_invalid_to_surrogate_pair();
    test_invalid_from_surrogate_pair();

    /* Exit code reflects whether every planned test passed. */
    status = exit_status();
    return status;
}
#include <ccan/charset/charset.c>
#include <ccan/tap/tap.h>
#include <string.h>
#include "common.h"
/*
* Testing procedure for utf8_read_char and utf8_write_char:
*
* * Generate N valid and invalid Unicode code points.
* * Encode them with utf8_write_char.
* * Copy the resulting string into a buffer sized exactly as big as
* the string produced. This way, Valgrind can catch buffer overflows
* by utf8_validate and utf8_read_char.
* * Validate the string with utf8_validate.
* * Decode the string, ensuring that:
* - Valid codepoints are read back.
* - Invalid characters are read back, but replaced
* with REPLACEMENT_CHARACTER.
* - No extra characters are read back.
*/
#define TRIAL_COUNT 1000
#define MAX_CHARS_PER_TRIAL 100
#define range(r, lo, hi) ((r) % ((hi)-(lo)+1) + (lo))
/*
 * Round-trip test for utf8_write_char, utf8_validate and utf8_read_char.
 *
 * Runs TRIAL_COUNT trials, each reporting exactly one test result.
 * A trial encodes 1..MAX_CHARS_PER_TRIAL random code points (valid and
 * invalid), copies the bytes into an exactly-sized heap buffer, validates
 * them, and decodes them again, expecting invalid inputs to come back as
 * REPLACEMENT_CHARACTER.
 */
int main(void)
{
    int trial;

    plan_tests(TRIAL_COUNT);

    for (trial = 1; trial <= TRIAL_COUNT; trial++) {
        int i, count;
        uchar_t codepoints[MAX_CHARS_PER_TRIAL]; /* expected decode results */
        uchar_t c;
        bool c_valid;
        char write_buffer[MAX_CHARS_PER_TRIAL * 4]; /* 4 bytes max per char */
        char *o = write_buffer;
        char *oe = write_buffer + sizeof(write_buffer);
        char *string;
        const char *s;
        const char *e;
        int len;

        count = rand32() % MAX_CHARS_PER_TRIAL + 1;

        /* Encode phase. */
        for (i = 0; i < count; i++) {
            if (o >= oe) {
                fail("utf8_write_char: Buffer overflow (1)");
                goto next_trial;
            }

            /* Pick a code point from one of seven classes. */
            switch (rand32() % 7) {
            case 0:
                c = range(rand32(), 0x0, 0x7F);
                c_valid = true;
                break;
            case 1:
                c = range(rand32(), 0x80, 0x7FF);
                c_valid = true;
                break;
            case 2:
                c = range(rand32(), 0x800, 0xD7FF);
                c_valid = true;
                break;
            case 3:
                /* Surrogates cannot be encoded. */
                c = range(rand32(), 0xD800, 0xDFFF);
                c_valid = false;
                break;
            case 4:
                c = range(rand32(), 0xE000, 0xFFFF);
                c_valid = true;
                break;
            case 5:
                c = range(rand32(), 0x10000, 0x10FFFF);
                c_valid = true;
                break;
            default:
                /* Beyond the Unicode range. */
                do {
                    c = rand32();
                } while (c < 0x110000);
                c_valid = false;
                break;
            }

            codepoints[i] = c_valid ? c : REPLACEMENT_CHARACTER;

            len = utf8_write_char(c, o);
            if (len < 1 || len > 4) {
                fail("utf8_write_char: Return value is not 1 thru 4.");
                goto next_trial;
            }
            o += len;
        }
        if (o > oe) {
            fail("utf8_write_char: Buffer overflow (2)");
            goto next_trial;
        }

        /*
         * Copy into a buffer sized exactly as big as the string produced,
         * so that Valgrind can catch overreads by utf8_validate and
         * utf8_read_char.
         */
        string = malloc(o - write_buffer);
        if (string == NULL) {
            /* Do not hand a null pointer to memcpy; count the trial as failed. */
            fail("malloc failed");
            goto next_trial;
        }
        memcpy(string, write_buffer, o - write_buffer);
        s = string;
        e = string + (o - write_buffer);

        if (!utf8_validate(s, e - s)) {
            fail("Invalid string produced by utf8_write_char.");
            goto next_trial_free_string;
        }

        /* Decode phase. */
        for (i = 0; i < count; i++) {
            if (s >= e) {
                fail("utf8_read_char: Buffer overflow (1)");
                goto next_trial_free_string;
            }

            len = utf8_read_char(s, &c);
            if (len < 1 || len > 4) {
                fail("utf8_read_char: Return value is not 1 thru 4.");
                goto next_trial_free_string;
            }
            if (c != codepoints[i]) {
                fail("utf8_read_char: Character read differs from that written.");
                goto next_trial_free_string;
            }
            s += len;
        }
        if (s > e) {
            fail("utf8_read_char: Buffer overflow (2)");
            goto next_trial_free_string;
        }
        if (s < e) {
            fail("utf8_read_char: Did not reach end of string.");
            goto next_trial_free_string;
        }

        pass("Trial %d: %d characters", trial, count);

    next_trial_free_string:
        free(string);
    next_trial:;
    }

    return exit_status();
}
#include <ccan/charset/charset.c>
#include <ccan/tap/tap.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include "common.h"
/*
 * Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes.
 *
 * @len            number of bytes the caller will encode the result into (1 thru 4)
 * @valid          true to get a value that is legal for that length,
 *                 false to get one that is not (overlong, surrogate,
 *                 out of range, or a stray byte value for @len == 1)
 * @after_clipped  true if the previous output was a truncated sequence;
 *                 only affects the invalid 1-byte case
 *
 * Consumes one rand32() value, plus more while re-rolling in the valid
 * 3-byte case.  A @len outside 1 thru 4 is a caller bug: it trips the
 * assertion, and returns 0 when assertions are compiled out.
 */
static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
{
    uint32_t r = rand32();
    uchar_t ret;

    /* The low 31 bits pick the value; the top bit is an independent coin flip. */
    #define range(lo, hi)  ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
    #define high_bit_set() (!!(r & 0x80000000))

    switch (len) {
    case 1:
        if (valid) {
            /* Generate a character U+0000..U+007F */
            return r & 0x7F;
        } else {
            /*
             * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
             *
             * However, don't generate U+0080..U+00BF (10xxxxxx) after a
             * clipped character, as that can inadvertently form a valid,
             * complete character.
             */
            if (!after_clipped && high_bit_set())
                return range(0x80, 0xBF);
            else
                return range(0xF8, 0xFF);
        }
    case 2:
        if (valid) {
            /* Generate a character U+0080..U+07FF */
            return range(0x80, 0x7FF);
        } else {
            /* Generate a character U+0000..U+007F */
            return r & 0x7F;
        }
    case 3:
        if (valid) {
            /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
            for (;;) {
                ret = range(0x800, 0xFFFF);
                if (ret >= 0xD800 && ret <= 0xDFFF) {
                    r = rand32();
                    continue;
                } else {
                    break;
                }
            }
            return ret;
        } else {
            /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
            if (high_bit_set())
                return r & 0x7FF;
            else
                return 0xD800 + (r & 0x7FF);
        }
    case 4:
        if (valid) {
            /* Generate a character U+10000..U+10FFFF */
            return range(0x10000, 0x10FFFF);
        } else {
            /* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */
            if (high_bit_set())
                return r & 0xFFFF;
            else
                return range(0x110000, 0x1FFFFF);
        }
    default:
        assert(false);
        /* Reached only when assertions are disabled; never fall off
           the end of a non-void function. */
        return 0;
    }

    #undef range
    #undef high_bit_set
}
/*
 * Write @uc to @out in the UTF-8 bit layout for a sequence of exactly
 * @len bytes, whether or not that is the correct length for @uc.
 *
 * @len should be 1 thru 4; any other value writes nothing.
 * For @len == 1 the byte is stored as given, so it may also be a raw
 * non-ASCII byte.  Each length asserts that @uc fits its layout.
 */
static void utf8_encode_raw(char *out, unsigned int uc, int len)
{
    if (len == 1) {
        assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
        out[0] = uc;
    } else if (len == 2) {
        assert(uc <= 0x7FF);
        out[0] = 0xC0 | ((uc >> 6) & 0x1F);
        out[1] = 0x80 | (uc & 0x3F);
    } else if (len == 3) {
        assert(uc <= 0xFFFF);
        out[0] = 0xE0 | ((uc >> 12) & 0x0F);
        out[1] = 0x80 | ((uc >> 6) & 0x3F);
        out[2] = 0x80 | (uc & 0x3F);
    } else if (len == 4) {
        assert(uc <= 0x1FFFFF);
        out[0] = 0xF0 | ((uc >> 18) & 0x07);
        out[1] = 0x80 | ((uc >> 12) & 0x3F);
        out[2] = 0x80 | ((uc >> 6) & 0x3F);
        out[3] = 0x80 | (uc & 0x3F);
    }
}
#if COMPUTE_AVERAGE_LENGTH
double total_averages;
#endif
/* Generate a UTF-8 string of the given byte length,
randomly deciding if it should be valid or not.
Return true if it's valid, false if it's not.

Exactly @len bytes are written to @out; no terminator is added.
The string is built one unit at a time, and the result is reported
as valid only if every unit was generated as a valid character. */
static bool utf8_mktest(char *out, int len)
{
double pf;
uint32_t pu;
/* Number of bytes produced by the current iteration. */
int n;
bool valid = true;
/* Whether the current unit was chosen to be a valid character. */
bool v;
/* True when the previous unit was a truncated (clipped) sequence. */
bool after_clipped = false;
#if COMPUTE_AVERAGE_LENGTH
int n_total = 0;
int count = 0;
#endif
/*
* Probability that, per character, it should be valid.
* The goal is to make utf8_mktest as a whole
* have a 50% chance of generating a valid string.
*
* The equation being solved is:
*
* p^n = 0.5
*
* where p is the probability that each character is valid,
* and n is the number of characters in the string.
*
* 2.384 is the approximate average length of each character,
* so len/2.384 is about how many characters this string
* is expected to contain.
*/
pf = pow(0.5, 2.384/len);
/* Convert to uint32_t to test against rand32. */
pu = pf * 4294967295.0;
/* Each pass emits n bytes and advances both the output and the budget. */
for (;len > 0; len -= n, out += n) {
v = rand32() <= pu;
if (v) {
/* Generate a valid character. */
/* Its length is 1 thru 4, capped at the bytes that remain. */
n = rand32() % (len < 4 ? len : 4) + 1;
utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
after_clipped = false;
} else if (rand32() % 5) {
/* Generate an invalid character. */
/* Taken four times out of five when the unit is to be invalid. */
n = rand32() % (len < 4 ? len : 4) + 1;
utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
after_clipped = false;
} else {
/* Generate a clipped but otherwise valid character. */
char tmp[4];
/* Encode a complete valid 2- to 4-byte character into tmp... */
n = rand32() % 3 + 2;
utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
/* ...then drop between 1 and n-1 of its trailing bytes. */
n -= rand32() % (n-1) + 1;
/* Never emit more than the remaining byte budget. */
if (n > len)
n = len;
assert(n >= 1 && n <= 3);
memcpy(out, tmp, n);
after_clipped = true;
}
/* A single non-valid unit makes the whole string invalid. */
if (!v)
valid = false;
#if COMPUTE_AVERAGE_LENGTH
n_total += n;
count++;
#endif
}
#if COMPUTE_AVERAGE_LENGTH
if (count > 0)
total_averages += (double)n_total / count;
#endif
return valid;
}
/*
 * Generate 100000 random byte strings of 0..128 bytes with utf8_mktest
 * and check that utf8_validate agrees with the generator's verdict.
 *
 * Reports two test results: whether every string was classified
 * correctly, and whether both valid and invalid strings each made up
 * more than a tenth of the trials.
 */
static void test_utf8_validate(void)
{
    char buffer[128];
    int count = 100000;
    int passed = 0, p_valid = 0, p_invalid = 0, total = 0;
    int i;

    #if COMPUTE_AVERAGE_LENGTH
    total_averages = 0.0;
    #endif

    for (i = 0; i < count; i++, total++) {
        int len = rand32() % (sizeof(buffer) + 1);
        bool expected = utf8_mktest(buffer, len);
        bool actual = utf8_validate(buffer, len);

        if (actual != expected) {
            printf("Failed: generated %s string, but utf8_validate returned %s\n",
                   expected ? "valid" : "invalid",
                   actual ? "true" : "false");
            continue;
        }

        passed++;
        if (expected)
            p_valid++;
        else
            p_invalid++;
    }

    if (passed != total)
        fail("Passed only %d out of %d tests\n", passed, total);
    else
        pass("%d valid tests, %d invalid tests", p_valid, p_invalid);

    ok(p_valid > count/10 && p_invalid > count/10,
       "Valid and invalid should be balanced");

    #if COMPUTE_AVERAGE_LENGTH
    printf("Average character length: %f\n", total_averages / count);
    #endif
}
/*
 * Test driver: test_utf8_validate reports two results.
 */
int main(void)
{
    int status;

    /* Announce how many results will be reported. */
    plan_tests(2);

    test_utf8_validate();

    /* The exit code depends on whether all tests passed. */
    status = exit_status();
    return status;
}
#include <ccan/charset/charset.h>
#include <ccan/charset/charset.c>
#include <ccan/tap/tap.h>
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Return the next pseudorandom 32-bit value (0 to 2^32-1).
 * Uses the BCPL linear congruential generator method.
 *
 * The state starts at 0 and is never seeded, so the sequence is the
 * same on every run; this is used instead of the system RNG to ensure
 * tests are consistent.
 */
static uint32_t rand32(void)
{
    static uint32_t state = 0;

    /* state = state * a + c, wrapping modulo 2^32 */
    state = state * (uint32_t)0x7FF8A3ED + (uint32_t)0x2AA01D31;
    return state;
}
/*
 * Pick a random Unicode character whose UTF-8 form is exactly @len bytes.
 *
 * For @len == 3, U+FFFE and U+FFFF are never returned, and neither is
 * U+D800 thru U+DFFF unless utf8_allow_surrogates is set.
 *
 * If @len is not 1 thru 4, an out-of-range value (0x110000 or above)
 * is returned instead.
 */
static unsigned int utf8_randcode(int len)
{
    uint32_t roll = rand32();

    if (len == 1)
        return roll % 0x80;

    if (len == 2)
        return roll % (0x800-0x80) + 0x80;

    if (len == 3) {
        /* Re-roll until the candidate is acceptable. */
        for (;;) {
            unsigned int candidate = roll % (0x10000-0x800) + 0x800;
            bool is_surrogate = candidate >= 0xD800 && candidate <= 0xDFFF;

            if (candidate < 0xFFFE && (utf8_allow_surrogates || !is_surrogate))
                return candidate;

            roll = rand32();
        }
    }

    if (len == 4)
        return roll % (0x110000-0x10000) + 0x10000;

    /* Any other length: keep drawing until the value exceeds U+10FFFF. */
    while (roll < 0x110000)
        roll = rand32();
    return roll;
}
/* Pick a random value in the surrogate range 0xD800..0xDFFF. */
static unsigned int rand_surrogate(void)
{
    const unsigned int first = 0xD800;
    const unsigned int span = 0xE000 - 0xD800;

    return first + rand32() % span;
}
/*
 * Write @uc to @out in the UTF-8 bit layout for a sequence of exactly
 * @len bytes.
 *
 * @len should be 1 thru 4; any other value writes nothing.
 * @uc is truncated to the payload bits available for that length.
 * If the truncated value is in the wrong range for the length,
 * the bytes written form an invalid character.
 */
static void utf8_encode_raw(char *out, unsigned int uc, int len)
{
    if (len == 1) {
        out[0] = uc & 0x7F;
    } else if (len == 2) {
        out[0] = 0xC0 | ((uc >> 6) & 0x1F);
        out[1] = 0x80 | (uc & 0x3F);
    } else if (len == 3) {
        out[0] = 0xE0 | ((uc >> 12) & 0x0F);
        out[1] = 0x80 | ((uc >> 6) & 0x3F);
        out[2] = 0x80 | (uc & 0x3F);
    } else if (len == 4) {
        out[0] = 0xF0 | ((uc >> 18) & 0x07);
        out[1] = 0x80 | ((uc >> 12) & 0x3F);
        out[2] = 0x80 | ((uc >> 6) & 0x3F);
        out[3] = 0x80 | (uc & 0x3F);
    }
}
/* Generate a UTF-8 string of the given byte length,
randomly deciding if it should be valid or not.
Return true if it's valid, false if it's not.

Exactly @len bytes are written to @out; no terminator is added.
Invalid units are produced by encoding a character in more bytes
than its value calls for, or (for 3 bytes) by encoding a surrogate. */
static bool utf8_mktest(char *out, int len)
{
/* m: longest unit that still fits; n: length of the current unit. */
int m, n;
bool valid = true;
/* Whether the current unit is to be a valid character. */
bool v;
double pf;
uint32_t pu;
/* Probability that, per character, it should be valid.
The goal is to make utf8_mktest as a whole
have a 50% chance of generating a valid string. */
/* NOTE(review): for len == 0 the exponent is a floating-point division
by zero; the loop below then does not run. */
pf = pow(0.5, 2.5/len);
/* Convert to uint32_t to test against rand32. */
pu = pf * 4294967295.0;
for (;len; len -= n) {
/* With one byte left only a valid character is generated, because the
invalid forms below all need at least two bytes. */
v = len == 1 || rand32() <= pu;
m = len < 4 ? len : 4;
if (v) {
/* Generate a valid character. */
n = rand32() % m + 1;
utf8_encode_raw(out, utf8_randcode(n), n);
} else {
/* Generate an invalid character. */
assert(m >= 2);
/* Length of the invalid unit: 2 thru m. */
n = rand32() % (m-1) + 2;
switch (n) {
case 2:
/* A 1-byte character spread over 2 bytes. */
utf8_encode_raw(out, utf8_randcode(1), n);
break;
case 3:
/* Either a surrogate (only while those are disallowed), or a
1- or 2-byte character spread over 3 bytes. */
if (!utf8_allow_surrogates && (rand32() & 1))
utf8_encode_raw(out, rand_surrogate(), n);
else
utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
break;
case 4:
/* A 1- to 3-byte character spread over 4 bytes. */
utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
break;
}
valid = false;
}
out += n;
}
return valid;
}
/*
 * Generate 10000 random byte strings with utf8_mktest and check that
 * utf8_validate agrees with the generator's verdict.
 *
 * @allow_surrogates is stored in utf8_allow_surrogates before the run.
 *
 * Reports two test results: whether EVERY string was classified
 * correctly, and whether both valid and invalid strings each made up
 * more than a tenth of the trials.
 */
static void test_utf8_validate(bool allow_surrogates)
{
    char buffer[1024];
    int i;
    int len;
    bool valid;
    int passed=0, p_valid=0, p_invalid=0, total=0;
    int count;

    count = 10000;

    utf8_allow_surrogates = allow_surrogates;

    for (i=0; i<count; i++) {
        /* Any length from empty up to the full buffer. */
        len = rand32() % (sizeof(buffer) + 1);
        valid = utf8_mktest(buffer, len);
        if (utf8_validate(buffer, len) == valid) {
            passed++;
            if (valid)
                p_valid++;
            else
                p_invalid++;
        }
        total++;
    }

    if (passed == total) {
        printf("PASS: %d valid tests, %d invalid tests\n",
            p_valid, p_invalid);
    } else {
        printf("FAIL: Passed %d out of %d tests\n", passed, total);
    }

    /* Succeed only when every trial passed, not merely when some did. */
    ok(passed == total, "utf8_validate test passed%s",
        !allow_surrogates ? " (surrogates disallowed)" : "");

    ok(p_valid > count/10 && p_invalid > count/10,
        " valid/invalid are balanced");
}
/*
 * Test driver: runs the validation test once with surrogates rejected
 * and once with them allowed; each run reports two results.
 */
int main(void)
{
    int status;

    /* Announce how many results will be reported. */
    plan_tests(4);

    test_utf8_validate(false);
    test_utf8_validate(true);

    /* The exit code depends on whether all tests passed. */
    status = exit_status();
    return status;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment