Commit c8c69dc6 authored by Joseph Adams's avatar Joseph Adams Committed by Rusty Russell

Joey's charset validation module.

parent e96830ae
#include <stdio.h>
#include <string.h>
#include "config.h"
/**
* charset - character set conversion and validation routines
*
* This module provides a collection (well, only one, at the moment) of
* well-tested routines for dealing with character set nonsense.
*
* Validation functions:
* - bool utf8_validate(const char *str, size_t length);
*
* Example:
* #include <err.h>
* #include <stdio.h>
* #include <string.h>
* #include <ccan/charset/charset.h>
* #include <ccan/grab_file/grab_file.h>
* #include <ccan/talloc/talloc.h> // For talloc_free()
*
* int main(int argc, char *argv[])
* {
* size_t len;
* char *file;
* bool valid;
*
* if (argc != 2)
* err(1, "Expected exactly one argument");
*
* file = grab_file(NULL, argv[1], &len);
* if (!file)
* err(1, "Could not read file %s", argv[1]);
*
* valid = utf8_validate(file, len);
* printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
*
* talloc_free(file);
*
* return 0;
* }
*
* Author: Joey Adams
* Licence: MIT
*/
/*
 * Answer a single metadata query given as the only command-line argument.
 *
 * "depends" prints nothing, "libs" prints the extra library to link against.
 * Returns 0 for a recognised query, 1 for a wrong argument count or an
 * unrecognised query.
 */
int main(int argc, char *argv[])
{
    const char *query;

    /* Expect exactly one argument */
    if (argc != 2)
        return 1;
    query = argv[1];

    /* No dependencies to report. */
    if (!strcmp(query, "depends"))
        return 0;

    if (!strcmp(query, "libs")) {
        /* Needed for the pow() invocation in run.c */
        printf("m\n");
        return 0;
    }

    return 1;
}
/*
Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "charset.h"
/* When true, utf8_validate() accepts encodings of U+D800 thru U+DFFF.
   Default: false. */
bool utf8_allow_surrogates = false;

/*
 * Check whether the @length bytes starting at @str form valid UTF-8.
 *
 * Embedded '\0' bytes are treated as ordinary ASCII.  Overlong encodings,
 * code points above U+10FFFF, stray continuation bytes and truncated
 * sequences are rejected.  Surrogates (U+D800 thru U+DFFF) are rejected
 * unless utf8_allow_surrogates is true.
 *
 * Returns true if the whole buffer is valid (an empty buffer is valid),
 * false otherwise.
 */
bool utf8_validate(const char *str, size_t length)
{
    const unsigned char *s = (const unsigned char*)str;
    const unsigned char *e = s + length;

    while (s < e) {
        unsigned char c = *s++;
        unsigned int len; /* number of bytes in sequence - 2 */

        /* If character is ASCII, move on. */
        if (c < 0x80)
            continue;

        /* Every branch below inspects *s, so one more byte must exist. */
        if (s >= e)
            return false; /* Missing bytes in sequence. */

        if (c < 0xE0) {
            /* 2-byte sequence, U+0080 to U+07FF
               c must be 11000010 or higher
               (lower values are continuation bytes or overlong forms)
               s[0] must be 10xxxxxx */
            len = 0;
            if (c < 0xC2)
                return false;
        } else if (c < 0xF0) {
            /* 3-byte sequence, U+0800 to U+FFFF
               Note that the surrogate range is U+D800 to U+DFFF
               c must be >= 11100000 (which it is)
               If c is 11100000, then s[0] must be >= 10100000
               If the global parameter utf8_allow_surrogates is false:
                  If c is 11101101 and s[0] is >= 10100000,
                  then this is a surrogate and we should fail.
               s[0] and s[1] must be 10xxxxxx */
            len = 1;
            if (c == 0xE0 && *s < 0xA0)
                return false;
            if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
                return false;
        } else {
            /* 4-byte sequence, U+010000 to U+10FFFF
               c must be >= 11110000 (which it is) and <= 11110100
               If c is 11110000, then s[0] must be >= 10010000
               If c is 11110100, then s[0] must be < 10010000
               s[0], s[1], and s[2] must be 10xxxxxx */
            len = 2;
            if (c > 0xF4)
                return false;
            if (c == 0xF0 && *s < 0x90)
                return false;
            if (c == 0xF4 && *s >= 0x90)
                return false;
        }

        /* len + 1 continuation bytes are required.  Compare against the
           number of bytes left instead of computing s + len, which could
           point more than one element past the end of the buffer. */
        if ((size_t)(e - s) <= len)
            return false; /* Missing bytes in sequence. */

        /* Each of the len + 1 trailing bytes must be 10xxxxxx. */
        do {
            if ((*s++ & 0xC0) != 0x80)
                return false;
        } while (len--);
    }

    return true;
}
/*
Note to future contributors: These routines are currently all under the
MIT license. It would be nice to keep it that way :)
*/
/*
Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CCAN_CHARSET_H
#define CCAN_CHARSET_H
#include <stdbool.h>
#include <stddef.h>
/*
 * utf8_validate - check whether a buffer holds valid UTF-8
 * @str: start of the bytes to check
 * @length: number of bytes to check
 *
 * Validate the given UTF-8 string.  The length is given explicitly, so
 * if the string contains '\0' characters, it is still valid.
 *
 * By default, Unicode characters U+D800 thru U+DFFF will be considered
 * invalid UTF-8.  However, if you set utf8_allow_surrogates to true,
 * they will be allowed.  Allowing the surrogate range makes it possible
 * to losslessly encode malformed UTF-16.
 *
 * Returns true if the string is valid UTF-8, false otherwise.
 */
bool utf8_validate(const char *str, size_t length);
/* Global switch read by utf8_validate(): set to true to accept
   U+D800 thru U+DFFF.  Default: false */
extern bool utf8_allow_surrogates;
#endif
#include <ccan/charset/charset.h>
#include <ccan/charset/charset.c>
#include <ccan/tap/tap.h>
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Return the next pseudorandom 32-bit value (0 to 2^32-1).
 *
 * This is a linear congruential generator (the BCPL method) whose state
 * starts at zero, so every run yields the same sequence.  It is used
 * instead of the system RNG to keep the tests consistent.
 */
static uint32_t rand32(void)
{
    static uint32_t state = 0;
    const uint32_t multiplier = 0x7FF8A3ED;
    const uint32_t increment = 0x2AA01D31;

    /* uint32_t arithmetic wraps modulo 2^32. */
    state = state * multiplier + increment;
    return state;
}
/*
 * Pick a random Unicode code point whose UTF-8 form needs exactly @len bytes.
 *
 * For @len == 3, values in U+D800 thru U+DFFF are re-drawn unless
 * utf8_allow_surrogates is set.
 *
 * Any @len outside 1 thru 4 yields an out-of-range value (>= 0x110000).
 */
static unsigned int utf8_randcode(int len)
{
    uint32_t rnd = rand32();

    if (len == 1)
        return rnd % 0x80;

    if (len == 2)
        return rnd % (0x800-0x80) + 0x80;

    if (len == 3) {
        unsigned int code = rnd % (0x10000-0x800) + 0x800;

        /* Draw again for as long as we land on a forbidden surrogate. */
        while (!utf8_allow_surrogates && code >= 0xD800 && code <= 0xDFFF) {
            rnd = rand32();
            code = rnd % (0x10000-0x800) + 0x800;
        }
        return code;
    }

    if (len == 4)
        return rnd % (0x110000-0x10000) + 0x10000;

    /* Out-of-range request: keep drawing until the value is too big. */
    while (rnd < 0x110000)
        rnd = rand32();
    return rnd;
}
/* Pick a random code point in the surrogate range U+D800 thru U+DFFF. */
static unsigned int rand_surrogate(void)
{
    const unsigned int first = 0xD800;
    const unsigned int span = 0xE000 - 0xD800;

    return rand32() % span + first;
}
/* Write @uc to @out as a UTF-8 sequence of exactly @len bytes.
   @len should be 1 thru 4; any other value writes nothing.
   Only the bits of @uc that fit into a @len-byte sequence are used.
   No range checking is done: if the truncated @uc does not belong in a
   @len-byte sequence, the bytes written are an invalid character. */
static void utf8_encode_raw(char *out, unsigned int uc, int len)
{
    /* Indexed by sequence length: lead-byte tag bits and payload mask. */
    static const unsigned char lead_tag[5]     = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
    static const unsigned char payload_mask[5] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };
    int shift;
    int i;

    if (len < 1 || len > 4)
        return;

    /* The lead byte carries the topmost bits, 6 bits per trailing byte. */
    shift = 6 * (len - 1);
    out[0] = lead_tag[len] | ((uc >> shift) & payload_mask[len]);

    /* Remaining bytes are 10xxxxxx, most significant group first. */
    for (i = 1; i < len; i++) {
        shift -= 6;
        out[i] = 0x80 | ((uc >> shift) & 0x3F);
    }
}
/* Fill @out with exactly @len bytes of UTF-8 test data, choosing at random
   (per character) whether to emit a valid or an invalid encoding.
   Returns true if every character written was valid, false if at least
   one invalid character was written. */
static bool utf8_mktest(char *out, int len)
{
    bool all_valid = true;
    /* Per-character probability of being valid.  The goal is for the
       string as a whole to have a 50% chance of being valid. */
    const double p_char_valid = pow(0.5, 2.5/len);
    /* Same probability scaled to the range of rand32(). */
    const uint32_t threshold = p_char_valid * 4294967295.0;
    int remaining = len;

    while (remaining != 0) {
        /* A character may use at most 4 bytes, and no more than remain. */
        const int max_bytes = remaining < 4 ? remaining : 4;
        unsigned int code;
        int nbytes;

        /* A lone final byte can only hold a valid (ASCII) character. */
        if (remaining == 1 || rand32() <= threshold) {
            /* Generate a valid character. */
            nbytes = rand32() % max_bytes + 1;
            code = utf8_randcode(nbytes);
        } else {
            /* Generate an invalid character of 2 thru max_bytes bytes. */
            assert(max_bytes >= 2);
            nbytes = rand32() % (max_bytes-1) + 2;

            if (nbytes == 3 && !utf8_allow_surrogates && (rand32() & 1)) {
                /* A surrogate, which is invalid in this mode. */
                code = rand_surrogate();
            } else if (nbytes == 2) {
                /* A 1-byte code stretched over 2 bytes. */
                code = utf8_randcode(1);
            } else {
                /* A code needing fewer bytes than nbytes. */
                code = utf8_randcode(rand32() % (nbytes-1) + 1);
            }
            all_valid = false;
        }

        utf8_encode_raw(out, code, nbytes);
        out += nbytes;
        remaining -= nbytes;
    }

    return all_valid;
}
/*
 * Run utf8_validate() against randomly generated strings and report the
 * outcome through two ok() checks:
 *   1. every generated string was classified as utf8_mktest() expected;
 *   2. both valid and invalid strings made up more than a tenth of the
 *      cases, i.e. the generator is reasonably balanced.
 *
 * @allow_surrogates is stored in the global utf8_allow_surrogates before
 * any string is generated, so it affects both generation and validation.
 */
static void test_utf8_validate(bool allow_surrogates)
{
    char buffer[1024];
    const int count = 10000;
    int passed = 0, p_valid = 0, p_invalid = 0, total = 0;
    int i;

    utf8_allow_surrogates = allow_surrogates;

    for (i = 0; i < count; i++) {
        /* Lengths from 0 up to and including the full buffer size. */
        int len = rand32() % (sizeof(buffer) + 1);
        bool valid = utf8_mktest(buffer, len);

        if (utf8_validate(buffer, len) == valid) {
            passed++;
            if (valid)
                p_valid++;
            else
                p_invalid++;
        }
        total++;
    }

    if (passed == total) {
        printf("PASS: %d valid tests, %d invalid tests\n",
            p_valid, p_invalid);
    } else {
        printf("FAIL: Passed %d out of %d tests\n", passed, total);
    }

    /* Require ALL cases to pass; a bare "passed" would succeed as soon as
       a single case matched. */
    ok(passed == total, "utf8_validate test passed%s",
        !allow_surrogates ? " (surrogates disallowed)" : "");

    ok(p_valid > count/10 && p_invalid > count/10,
        " valid/invalid are balanced");
}
/* Test driver: runs the validation test once with surrogates disallowed
   and once with them allowed. */
int main(void)
{
    static const bool surrogate_modes[] = { false, true };
    unsigned int i;

    /* This is how many tests you plan to run:
       each test_utf8_validate() call makes two ok() checks. */
    plan_tests(4);

    for (i = 0; i < sizeof(surrogate_modes) / sizeof(surrogate_modes[0]); i++)
        test_utf8_validate(surrogate_modes[i]);

    /* This exits depending on whether all tests passed */
    return exit_status();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment