Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
13c3e380
Commit
13c3e380
authored
Aug 14, 2007
by
Martin v. Löwis
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add XID_Start and XID_Continue properties to unicodectype.
parent
ff398c6f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
1020 additions
and
899 deletions
+1020
-899
Include/unicodeobject.h
Include/unicodeobject.h
+12
-0
Objects/unicodectype.c
Objects/unicodectype.c
+22
-0
Objects/unicodetype_db.h
Objects/unicodetype_db.h
+930
-886
Tools/unicode/makeunicodedata.py
Tools/unicode/makeunicodedata.py
+56
-13
No files found.
Include/unicodeobject.h
View file @
13c3e380
...
...
@@ -205,6 +205,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
...
...
@@ -289,6 +291,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
...
...
@@ -1274,6 +1278,14 @@ PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
Py_UNICODE
ch
/* Unicode character */
);
PyAPI_FUNC
(
int
)
_PyUnicode_IsXidStart
(
Py_UNICODE
ch
/* Unicode character */
);
PyAPI_FUNC
(
int
)
_PyUnicode_IsXidContinue
(
Py_UNICODE
ch
/* Unicode character */
);
PyAPI_FUNC
(
int
)
_PyUnicode_IsWhitespace
(
const
Py_UNICODE
ch
/* Unicode character */
);
...
...
Objects/unicodectype.c
View file @
13c3e380
...
...
@@ -19,6 +19,8 @@
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
typedef
struct
{
const
Py_UNICODE
upper
;
...
...
@@ -98,6 +100,26 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
return
(
ctype
->
flags
&
TITLE_MASK
)
!=
0
;
}
/* Test whether the type record looked up for ch has the XID_Start
   flag set.  Yields 1 when the flag is present and 0 when it is not. */
int
_PyUnicode_IsXidStart(Py_UNICODE ch)
{
    const _PyUnicode_TypeRecord *record = gettyperecord(ch);

    if (record->flags & XID_START_MASK)
        return 1;
    return 0;
}
/* Test whether the type record looked up for ch has the XID_Continue
   flag set.  Yields 1 when the flag is present and 0 when it is not. */
int
_PyUnicode_IsXidContinue(Py_UNICODE ch)
{
    const _PyUnicode_TypeRecord *record = gettyperecord(ch);

    if (record->flags & XID_CONTINUE_MASK)
        return 1;
    return 0;
}
/* Returns the integer decimal (0-9) for Unicode characters having
this property, -1 otherwise. */
...
...
Objects/unicodetype_db.h
View file @
13c3e380
This source diff could not be displayed because it is too large. You can
view the blob
instead.
Tools/unicode/makeunicodedata.py
View file @
13c3e380
...
...
@@ -34,6 +34,7 @@ UNIDATA_VERSION = "4.1.0"
UNICODE_DATA
=
"UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS
=
"CompositionExclusions%s.txt"
EASTASIAN_WIDTH
=
"EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES
=
"DerivedCoreProperties%s.txt"
old_versions
=
[
"3.2.0"
]
...
...
@@ -57,6 +58,8 @@ LINEBREAK_MASK = 0x10
SPACE_MASK
=
0x20
TITLE_MASK
=
0x40
UPPER_MASK
=
0x80
XID_START_MASK
=
0x100
XID_CONTINUE_MASK
=
0x200
def
maketables
(
trace
=
0
):
...
...
@@ -65,16 +68,18 @@ def maketables(trace=0):
version
=
""
unicode
=
UnicodeData
(
UNICODE_DATA
%
version
,
COMPOSITION_EXCLUSIONS
%
version
,
EASTASIAN_WIDTH
%
version
)
EASTASIAN_WIDTH
%
version
,
DERIVED_CORE_PROPERTIES
%
version
)
print
(
len
(
filter
(
None
,
unicode
.
table
)),
"characters"
)
print
(
len
(
list
(
filter
(
None
,
unicode
.
table
)
)),
"characters"
)
for
version
in
old_versions
:
print
(
"--- Reading"
,
UNICODE_DATA
%
(
"-"
+
version
),
"..."
)
old_unicode
=
UnicodeData
(
UNICODE_DATA
%
(
"-"
+
version
),
COMPOSITION_EXCLUSIONS
%
(
"-"
+
version
),
EASTASIAN_WIDTH
%
(
"-"
+
version
))
print
(
len
(
filter
(
None
,
old_unicode
.
table
)),
"characters"
)
EASTASIAN_WIDTH
%
(
"-"
+
version
),
DERIVED_CORE_PROPERTIES
%
(
"-"
+
version
))
print
(
len
(
list
(
filter
(
None
,
old_unicode
.
table
))),
"characters"
)
merge_old_version
(
version
,
unicode
,
old_unicode
)
makeunicodename
(
unicode
,
trace
)
...
...
@@ -148,7 +153,7 @@ def makeunicodedata(unicode, trace):
assert
prefix
<
256
# content
decomp
=
[
prefix
+
(
len
(
decomp
)
<<
8
)]
+
\
map
(
lambda
s
:
int
(
s
,
16
),
decomp
)
list
(
map
(
lambda
s
:
int
(
s
,
16
),
decomp
)
)
# Collect NFC pairs
if
not
prefix
and
len
(
decomp
)
==
3
and
\
char
not
in
unicode
.
exclusions
and
\
...
...
@@ -353,6 +358,7 @@ def makeunicodetype(unicode, trace):
# extract database properties
category
=
record
[
2
]
bidirectional
=
record
[
4
]
properties
=
record
[
16
]
flags
=
0
if
category
in
[
"Lm"
,
"Lt"
,
"Lu"
,
"Ll"
,
"Lo"
]:
flags
|=
ALPHA_MASK
...
...
@@ -366,6 +372,10 @@ def makeunicodetype(unicode, trace):
flags
|=
TITLE_MASK
if
category
==
"Lu"
:
flags
|=
UPPER_MASK
if
"XID_Start"
in
properties
:
flags
|=
XID_START_MASK
if
"XID_Continue"
in
properties
:
flags
|=
XID_CONTINUE_MASK
# use delta predictor for upper/lower/title
if
record
[
12
]:
upper
=
int
(
record
[
12
],
16
)
-
char
...
...
@@ -447,7 +457,7 @@ def makeunicodename(unicode, trace):
if
name
and
name
[
0
]
!=
"<"
:
names
[
char
]
=
name
+
chr
(
0
)
print
(
len
(
filter
(
lambda
n
:
n
is
not
None
,
names
)),
"distinct names"
)
print
(
len
(
list
(
filter
(
lambda
n
:
n
is
not
None
,
names
)
)),
"distinct names"
)
# collect unique words from names (note that we distinguish between
# words inside a sentence, and words ending a sentence. the
...
...
@@ -470,10 +480,12 @@ def makeunicodename(unicode, trace):
print
(
n
,
"words in text;"
,
b
,
"bytes"
)
wordlist
=
words
.
items
(
)
wordlist
=
list
(
words
.
items
()
)
# sort on falling frequency, then by name
def
cmpwords
((
aword
,
alist
),(
bword
,
blist
)):
def
cmpwords
(
a
,
b
):
aword
,
alist
=
a
bword
,
blist
=
b
r
=
-
cmp
(
len
(
alist
),
len
(
blist
))
if
r
:
return
r
...
...
@@ -526,7 +538,7 @@ def makeunicodename(unicode, trace):
words
[
w
]
=
len
(
lexicon_offset
)
lexicon_offset
.
append
(
o
)
lexicon
=
map
(
ord
,
lexicon
)
lexicon
=
list
(
map
(
ord
,
lexicon
)
)
# generate phrasebook from names and lexicon
phrasebook
=
[
0
]
...
...
@@ -660,11 +672,14 @@ def merge_old_version(version, new, old):
elif
k
==
14
:
# change to simple titlecase mapping; ignore
pass
elif
k
==
16
:
# derived property changes; not handled yet
pass
else
:
class
Difference
(
Exception
):
pass
raise
Difference
,
(
hex
(
i
),
k
,
old
.
table
[
i
],
new
.
table
[
i
])
new
.
changed
.
append
((
version
,
zip
(
bidir_changes
,
category_changes
,
decimal_changes
,
numeric_changes
),
new
.
changed
.
append
((
version
,
list
(
zip
(
bidir_changes
,
category_changes
,
decimal_changes
,
numeric_changes
)
)
,
normalization_changes
))
...
...
@@ -677,8 +692,14 @@ def merge_old_version(version, new, old):
import
sys
class
UnicodeData
:
def
__init__
(
self
,
filename
,
exclusions
,
eastasianwidth
,
expand
=
1
):
# Record structure:
# [ID, name, category, combining, bidi, decomp, (6)
# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17)
def
__init__
(
self
,
filename
,
exclusions
,
eastasianwidth
,
derivedprops
,
expand
=
1
):
self
.
changed
=
[]
file
=
open
(
filename
)
table
=
[
None
]
*
0x110000
...
...
@@ -742,6 +763,28 @@ class UnicodeData:
if
table
[
i
]
is
not
None
:
table
[
i
].
append
(
widths
[
i
])
for
i
in
range
(
0
,
0x110000
):
if
table
[
i
]
is
not
None
:
table
[
i
].
append
(
set
())
for
s
in
open
(
derivedprops
):
s
=
s
.
split
(
'#'
,
1
)[
0
].
strip
()
if
not
s
:
continue
r
,
p
=
s
.
split
(
";"
)
r
=
r
.
strip
()
p
=
p
.
strip
()
if
".."
in
r
:
first
,
last
=
[
int
(
c
,
16
)
for
c
in
r
.
split
(
'..'
)]
chars
=
range
(
first
,
last
+
1
)
else
:
chars
=
[
int
(
r
,
16
)]
for
char
in
chars
:
if
table
[
char
]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table
[
char
][
-
1
].
add
(
p
)
def
uselatin1
(
self
):
# restrict character range to ISO Latin 1
self
.
chars
=
range
(
256
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment