Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
ccan
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
mirror
ccan
Commits
12af7e37
Commit
12af7e37
authored
Jun 14, 2011
by
Joey Adams
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
charset: Added utf8_validate_char (factored out of utf8_validate).
parent
06c4af31
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
82 additions
and
52 deletions
+82
-52
ccan/charset/charset.c
ccan/charset/charset.c
+72
-52
ccan/charset/charset.h
ccan/charset/charset.h
+10
-0
No files found.
ccan/charset/charset.c
View file @
12af7e37
...
...
@@ -22,6 +22,24 @@
*/
#include "charset.h"
#include <assert.h>
/*
 * Check whether the @length bytes starting at @str are valid UTF-8.
 *
 * The buffer is consumed one character at a time with utf8_validate_char();
 * the first character it rejects (return value 0) makes the whole buffer
 * invalid.  Returns true when every character is accepted, which includes
 * the case of an empty buffer.
 */
bool utf8_validate(const char *str, size_t length)
{
	const char *end = str + length;
	const char *pos = str;

	while (pos < end) {
		int step = utf8_validate_char(pos, end);

		if (step == 0)
			return false;
		pos += step;
	}

	/* Each accepted character lies entirely before end, so we land on it. */
	assert(pos == end);

	return true;
}
/*
* This function implements the syntax given in RFC3629, which is
...
...
@@ -37,68 +55,70 @@
* * The sixty-six Unicode "non-characters" are permitted
* (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
*/
/*
 * Validate the single UTF-8 character that starts at @s.
 * @s: First byte of the character.
 * @e: End of the buffer (one past its last byte); no byte at or beyond
 *     @e is read.
 *
 * Returns the encoded length in bytes (1 through 4) when the sequence is
 * well-formed, or 0 when it is invalid, clipped by @e, or the range
 * between @s and @e is empty.
 */
int utf8_validate_char(const char *s, const char *e)
{
	/* Work on unsigned bytes so range comparisons do not depend on
	 * the signedness of plain char. */
	const unsigned char *p = (const unsigned char *)s;
	const unsigned char *end = (const unsigned char *)e;
	unsigned char c;

	/* An empty range holds no character; do not read the lead byte. */
	if (p >= end)
		return 0;

	c = *p++;

	if (c <= 0x7F) { /* 00..7F */
		return 1;
	} else if (c <= 0xC1) { /* 80..C1 */
		/* Stray continuation byte (80..BF) or overlong
		 * 2-byte lead (C0..C1). */
		return 0;
	} else if (c <= 0xDF) { /* C2..DF */
		/* Make sure the character isn't clipped. */
		if (end - p < 1)
			return 0;

		/* Make sure subsequent byte is in the range 0x80..0xBF. */
		if ((p[0] & 0xC0) != 0x80)
			return 0;

		return 2;
	} else if (c <= 0xEF) { /* E0..EF */
		/* Make sure the character isn't clipped. */
		if (end - p < 2)
			return 0;

		/* Disallow overlong 3-byte sequence. */
		if (c == 0xE0 && p[0] < 0xA0)
			return 0;

		/* Disallow U+D800..U+DFFF. */
		if (c == 0xED && p[0] > 0x9F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
		if ((p[0] & 0xC0) != 0x80)
			return 0;
		if ((p[1] & 0xC0) != 0x80)
			return 0;

		return 3;
	} else if (c <= 0xF4) { /* F0..F4 */
		/* Make sure the character isn't clipped. */
		if (end - p < 3)
			return 0;

		/* Disallow overlong 4-byte sequence. */
		if (c == 0xF0 && p[0] < 0x90)
			return 0;

		/* Disallow codepoints beyond U+10FFFF. */
		if (c == 0xF4 && p[0] > 0x8F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
		if ((p[0] & 0xC0) != 0x80)
			return 0;
		if ((p[1] & 0xC0) != 0x80)
			return 0;
		if ((p[2] & 0xC0) != 0x80)
			return 0;

		return 4;
	} else { /* F5..FF */
		return 0;
	}
}
int
utf8_read_char
(
const
char
*
s
,
uchar_t
*
out
)
...
...
ccan/charset/charset.h
View file @
12af7e37
...
...
@@ -42,6 +42,16 @@ typedef uint32_t uchar_t;
*/
bool
utf8_validate
(
const
char
*
str
,
size_t
length
);
/*
 * Validate a single UTF-8 character.
 * @s: Beginning of UTF-8 character.
 * @e: End of string (one past its last byte).
 *
 * If it's valid, return its length in bytes (1 through 4).
 * If it's invalid or clipped by @e, return 0.
 */
int
utf8_validate_char
(
const
char
*
s
,
const
char
*
e
);
/*
* Read a single UTF-8 character starting at @s,
* returning the length, in bytes, of the character read.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment