Commit 938ac765, authored Jan 21, 2001 by Fredrik Lundh

    forgot to check in the new makeunicodedata.py script

Parent: d3099536

Showing 5 changed files with 8294 additions and 8383 deletions:
    Modules/ucnhash.c                 (+8, -10)
    Modules/unicodedata_db.h          (+1, -1)
    Modules/unicodename_db.h          (+8013, -8354)
    Objects/unicodetype_db.h          (+1, -1)
    Tools/unicode/makeunicodedata.py  (+271, -17)
Modules/ucnhash.c
@@ -11,16 +11,13 @@
 /* database code (cut and pasted from the unidb package) */

 static unsigned long
-gethash(const char* s, int len)
+gethash(const char* s, int len, int scale)
 {
     int i;
     unsigned long h = 0;
     unsigned long ix;
     for (i = 0; i < len; i++) {
-        /* magic value 47 was chosen to minimize the number
-           of collisions for the uninames dataset.  see the
-           makeunicodedata script for more background */
-        h = (h * 47) + (unsigned char) toupper(s[i]);
+        h = (h * scale) + (unsigned char) toupper(s[i]);
         ix = h & 0xff000000;
         if (ix)
             h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
@@ -40,8 +37,9 @@ getname(Py_UCS4 code, char* buffer, int buflen)
         return 0;

     /* get offset into phrasebook */
-    offset = phrasebook_offset1[(code>>SHIFT)];
-    offset = phrasebook_offset2[(offset<<SHIFT)+(code&((1<<SHIFT)-1))];
+    offset = phrasebook_offset1[(code>>phrasebook_shift)];
+    offset = phrasebook_offset2[(offset<<phrasebook_shift)+
+             (code&((1<<phrasebook_shift)-1))];
     if (!offset)
         return 0;
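The phrasebook_offset1/phrasebook_offset2 pair used above is a two-level index table produced by the script's splitbins() helper; phrasebook_shift is now emitted by the generator instead of being hard-coded as SHIFT. A minimal Python sketch of how such a split table can be built and queried (toy data and a simplified construction, an illustration of the idea rather than the actual splitbins() code):

def lookup(offset1, offset2, shift, code):
    # first-level table picks a block; second-level table holds the values
    block = offset1[code >> shift]
    return offset2[(block << shift) + (code & ((1 << shift) - 1))]

# toy example: split a flat table into blocks of 1<<shift entries,
# storing each distinct block only once
table = [0, 0, 0, 0, 5, 6, 7, 8, 0, 0, 0, 0]
shift = 2
blocks = [tuple(table[i:i + (1 << shift)])
          for i in range(0, len(table), 1 << shift)]
unique = sorted(set(blocks), key=blocks.index)   # duplicate blocks kept once
offset1 = [unique.index(b) for b in blocks]
offset2 = [v for b in unique for v in b]
assert all(lookup(offset1, offset2, shift, c) == table[c]
           for c in range(len(table)))

Runs of identical blocks (for example, long stretches of unassigned code points) collapse into a single second-level block, which is what keeps the generated tables compact.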
@@ -99,14 +97,14 @@ static int
 getcode(const char* name, int namelen, Py_UCS4* code)
 {
     unsigned int h, v;
-    unsigned int mask = CODE_SIZE-1;
+    unsigned int mask = code_size-1;
     unsigned int i, incr;

     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */

-    h = (unsigned int) gethash(name, namelen);
+    h = (unsigned int) gethash(name, namelen, code_magic);
     i = (~h) & mask;
     v = code_hash[i];
     if (!v)
@@ -129,7 +127,7 @@ getcode(const char* name, int namelen, Py_UCS4* code)
         }
         incr = incr << 1;
         if (incr > mask)
-            incr = incr ^ CODE_POLY;
+            incr = incr ^ code_poly;
     }
 }
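Taken together, these hunks replace the hard-coded CODE_SIZE/CODE_POLY constants with the generated code_size/code_magic/code_poly values, so the generator is free to pick the table size, hash multiplier, and rehash polynomial per table. A rough Python model of the lookup, for illustration only (not the actual CPython code; the matches() predicate stands in for the real code's check, elided from the diff, that a candidate code point actually carries the requested name):

def gethash(s, scale):
    h = 0
    for c in s.upper():
        h = (h * scale) + ord(c)
        ix = h & 0xff000000
        if ix:  # fold overflow above 24 bits back into the hash
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
    return h

def getcode(name, code_hash, code_magic, code_poly, matches):
    mask = len(code_hash) - 1            # code_size is a power of two
    h = gethash(name, code_magic)
    i = (~h) & mask
    v = code_hash[i]
    if not v:
        return None
    if matches(v, name):
        return v
    # collision: derive a probe increment from the hash, then walk the
    # table, mutating incr with the table's polynomial (same sequence
    # the generator uses when it builds the table)
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while 1:
        i = (i + incr) & mask
        v = code_hash[i]
        if not v:
            return None
        if matches(v, name):
            return v
        incr = incr << 1
        if incr > mask:
            incr = incr ^ code_poly

Because insertion (the Hash class added to makeunicodedata.py below) and lookup follow the identical probe sequence, hitting an empty slot during lookup proves the name is absent.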
Modules/unicodedata_db.h

-/* this file was generated by tools\unicode\makeunicodedata.py 1.1 */
+/* this file was generated by tools\unicode\makeunicodedata.py 2.1 */

 /* a list of unique database records */
 const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {
Modules/unicodename_db.h

(source diff too large to display)
Objects/unicodetype_db.h

-/* this file was generated by tools\unicode\makeunicodedata.py 1.1 */
+/* this file was generated by tools\unicode\makeunicodedata.py 2.1 */

 /* a list of unique character type descriptors */
 const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
Tools/unicode/makeunicodedata.py
@@ -2,14 +2,16 @@
 # (re)generate unicode property and type databases
 #
 # this script converts a unicode 3.0 database file to
-# Modules/unicodedata_db.h and Objects/unicodetype_db.h
+# Modules/unicodedata_db.h, Modules/unicodename_db.h,
+# and Objects/unicodetype_db.h
 #
 # history:
 # 2000-09-24 fl   created (based on bits and pieces from unidb)
 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
 # 2000-09-25 fl   added character type table
-# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
+# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
 # 2000-11-03 fl   expand first/last ranges
+# 2001-01-19 fl   added character name tables (2.1)
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@@ -17,7 +19,7 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "1.1"
+VERSION = "2.1"

 UNICODE_DATA = "UnicodeData-Latest.txt"
@@ -42,18 +44,32 @@ UPPER_MASK = 0x80

 def maketables(trace=0):

     print "--- Reading", UNICODE_DATA, "..."

     unicode = UnicodeData(UNICODE_DATA)

-    print "--- Processing", UNICODE_DATA, "..."
     print len(filter(None, unicode.table)), "characters"

     # extract unicode properties
+    makeunicodedata(unicode, trace)
+    makeunicodetype(unicode, trace)
+    makeunicodename(unicode, trace)
+
+# --------------------------------------------------------------------
+# unicode character properties
+
+def makeunicodedata(unicode, trace):
+
     dummy = (0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)

+    FILE = "Modules/unicodedata_db.h"
+
+    print "--- Preparing", FILE, "..."
+
     # 1) database properties
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -93,13 +109,11 @@ def maketables(trace=0):
                 i = 0
             decomp_index[char] = i

-    FILE = "Modules/unicodedata_db.h"
-
-    print "--- Writing", FILE, "..."
-
     print len(table), "unique properties"
     print len(decomp_data), "unique decomposition entries"

+    print "--- Writing", FILE, "..."
+
     fp = open(FILE, "w")

     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
@@ -111,7 +125,7 @@ def maketables(trace=0):
     print >>fp, "};"
     print >>fp

-    # FIXME: the following tables should be made static, and
+    # FIXME: <fl> the following tables could be made static, and
     # the support code moved into unicodedatabase.c

     print >>fp, "/* string literals */"
@@ -149,8 +163,16 @@ def maketables(trace=0):
     Array("decomp_index1", index1).dump(fp)
     Array("decomp_index2", index2).dump(fp)

-    #
-    # 3) unicode type data
+    fp.close()
+
+# --------------------------------------------------------------------
+# unicode character type tables
+
+def makeunicodetype(unicode, trace):
+
+    FILE = "Objects/unicodetype_db.h"
+
+    print "--- Preparing", FILE, "..."

     # extract unicode types
     dummy = (0, 0, 0, 0, 0, 0)
@@ -209,14 +231,11 @@ def maketables(trace=0):
             table.append(item)
         index[char] = i

-    FILE = "Objects/unicodetype_db.h"
-
-    fp = open(FILE, "w")
-
-    print len(table), "unique character type entries"
-
     print "--- Writing", FILE, "..."

+    print len(table), "unique character type entries"
+
+    fp = open(FILE, "w")

     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
     print >>fp, "/* a list of unique character type descriptors */"
@@ -234,6 +253,155 @@ def maketables(trace=0):
     Array("index1", index1).dump(fp)
     Array("index2", index2).dump(fp)

     fp.close()

+# --------------------------------------------------------------------
+# unicode name database
+
+def makeunicodename(unicode, trace):
+
+    FILE = "Modules/unicodename_db.h"
+
+    print "--- Preparing", FILE, "..."
+
+    # collect names
+    names = [None] * len(unicode.chars)
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                names[char] = name + chr(0)
+
+    print len(filter(lambda n: n is not None, names)), "distinct names"
+
+    # collect unique words from names (note that we differ between
+    # words inside a sentence, and words ending a sentence.  the
+    # latter includes the trailing null byte.
+
+    words = {}
+    n = b = 0
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            b = b + len(name)
+            n = n + len(w)
+            for w in w:
+                l = words.get(w)
+                if l:
+                    l.append(None)
+                else:
+                    words[w] = [len(words)]
+
+    print n, "words in text;", b, "bytes"
+
+    wordlist = words.items()
+
+    # sort on falling frequency
+    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
+
+    # statistics
+    n = 0
+    for i in range(128):
+        n = n + len(wordlist[i][1])
+    print n, "short words (7-bit indices)"
+
+    # pick the 128 most commonly used words, and sort the rest on
+    # falling length (to maximize overlap)
+
+    wordlist, wordtail = wordlist[:128], wordlist[128:]
+    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
+    wordlist.extend(wordtail)
+
+    # generate lexicon from words
+
+    lexicon_offset = [0]
+    lexicon = ""
+    words = {}
+
+    # build a lexicon string
+    offset = 0
+    for w, x in wordlist:
+        # encoding: bit 7 indicates last character in word (chr(128)
+        # indicates the last character in an entire string)
+        ww = w[:-1] + chr(ord(w[-1])+128)
+        # reuse string tails, when possible
+        o = string.find(lexicon, ww)
+        if o < 0:
+            o = offset
+            lexicon = lexicon + ww
+            offset = offset + len(w)
+        words[w] = len(lexicon_offset)
+        lexicon_offset.append(offset)
+
+    print len(words), "words in lexicon;", len(lexicon), "bytes"
+
+    assert len(words) < 32768 # 15-bit word indices
+
+    lexicon = map(ord, lexicon)
+
+    # generate phrasebook from names and lexicon
+    phrasebook = [0]
+    phrasebook_offset = [0] * len(unicode.chars)
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            phrasebook_offset[char] = len(phrasebook)
+            for w in w:
+                i = words[w]
+                if i < 128:
+                    phrasebook.append(128+i)
+                else:
+                    phrasebook.append(i>>8)
+                    phrasebook.append(i&255)
+
+    #
+    # unicode name hash table
+
+    # extract names
+    data = []
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                data.append((name, char))
+
+    # the magic number 47 was chosen to minimize the number of
+    # collisions on the current data set.  if you like, change it
+    # and see what happens...
+
+    codehash = Hash("code", data, 47)
+
+    print "--- Writing", FILE, "..."
+
+    fp = open(FILE, "w")
+
+    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+    print >>fp
+    print >>fp, "#define NAME_MAXLEN", 256
+    print >>fp
+    print >>fp, "/* lexicon */"
+    Array("lexicon", lexicon).dump(fp)
+    Array("lexicon_offset", lexicon_offset).dump(fp)
+
+    # split decomposition index table
+    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+
+    print >>fp, "/* code->name phrasebook */"
+    print >>fp, "#define phrasebook_shift", shift
+
+    Array("phrasebook", phrasebook).dump(fp)
+    Array("phrasebook_offset1", offset1).dump(fp)
+    Array("phrasebook_offset2", offset2).dump(fp)
+
+    print >>fp, "/* name->code dictionary */"
+    codehash.dump(fp)
+
+    fp.close()

 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
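Reading makeunicodename() above together with getname() in ucnhash.c: a name is stored as a phrasebook sequence of word indices (one byte for the 128 most frequent words, two bytes for the 15-bit indices of the rest), and every lexicon word marks its last character by setting bit 7, with the name-final word carrying an extra null byte. A hedged Python sketch of the code->name decoding, for illustration only (simplified; the real C getname() also checks buffer bounds):

def getword(lexicon, lexicon_offset, i):
    # collect lexicon bytes until the end-of-word marker (bit 7)
    o = lexicon_offset[i]
    chars = []
    while 1:
        c = lexicon[o]
        chars.append(chr(c & 127))
        if c & 128:
            return "".join(chars)
        o = o + 1

def getname(phrasebook, lexicon, lexicon_offset, offset):
    words = []
    while 1:
        b = phrasebook[offset]
        offset = offset + 1
        if b & 128:                    # short form: one of the 128 top words
            i = b & 127
        else:                          # long form: 15-bit word index
            i = (b << 8) + phrasebook[offset]
            offset = offset + 1
        w = getword(lexicon, lexicon_offset, i)
        if w[-1] == chr(0):            # the name-final word ends in a null
            words.append(w[:-1])
            return " ".join(words)
        words.append(w)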
@@ -280,6 +448,92 @@ class UnicodeData:
         # restrict character range to ISO Latin 1
         self.chars = range(256)

+# hash table tools
+
+# this is a straight-forward reimplementation of Python's built-in
+# dictionary type, using a static data structure, and a custom string
+# hash algorithm.
+
+def myhash(s, magic):
+    h = 0
+    for c in map(ord, string.upper(s)):
+        h = (h * magic) + c
+        ix = h & 0xff000000
+        if ix:
+            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
+    return h
+
+SIZES = [
+    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3),
+    (256,29), (512,17), (1024,9), (2048,5), (4096,83),
+    (8192,27), (16384,43), (32768,3), (65536,45), (131072,9),
+    (262144,39), (524288,39), (1048576,9), (2097152,5),
+    (4194304,3), (8388608,33), (16777216,27)
+]
+
+class Hash:
+    def __init__(self, name, data, magic):
+        # turn a (key, value) list into a static hash table structure
+
+        # determine table size
+        for size, poly in SIZES:
+            if size > len(data):
+                poly = size + poly
+                break
+        else:
+            raise AssertionError, "ran out of polynominals"
+
+        print size, "slots in hash table"
+
+        table = [None] * size
+
+        mask = size-1
+
+        n = 0
+
+        hash = myhash
+
+        # initialize hash table
+        for key, value in data:
+            h = hash(key, magic)
+            i = (~h) & mask
+            v = table[i]
+            if v is None:
+                table[i] = value
+                continue
+            incr = (h ^ (h >> 3)) & mask;
+            if not incr:
+                incr = mask
+            while 1:
+                n = n + 1
+                i = (i + incr) & mask
+                v = table[i]
+                if v is None:
+                    table[i] = value
+                    break
+                incr = incr << 1
+                if incr > mask:
+                    incr = incr ^ poly
+
+        print n, "collisions"
+        self.collisions = n
+
+        for i in range(len(table)):
+            if table[i] is None:
+                table[i] = 0
+
+        self.data = Array(name + "_hash", table)
+        self.magic = magic
+        self.name = name
+        self.size = size
+        self.poly = poly
+
+    def dump(self, file):
+        # write data to file, as a C array
+        self.data.dump(file)
+        file.write("#define %s_magic %d\n" % (self.name, self.magic))
+        file.write("#define %s_size %d\n" % (self.name, self.size))
+        file.write("#define %s_poly %d\n" % (self.name, self.poly))

 # stuff to deal with arrays of unsigned integers

 class Array:
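For illustration, Hash can be exercised on toy data the same way makeunicodename() calls Hash("code", data, 47). This sketch assumes the surrounding script context (Python 2, the string import, and the Array class that follows), and the data below is made up for the example:

import sys

data = [("LATIN SMALL LETTER A", 0x61), ("LATIN SMALL LETTER B", 0x62)]
codehash = Hash("code", data, 47)   # picks 4 slots: first SIZES entry > len(data)
codehash.dump(sys.stdout)           # writes code_hash[] plus the
                                    # code_magic/code_size/code_poly defines

The emitted #defines are exactly the constants that the reworked getcode() in Modules/ucnhash.c now consumes in place of CODE_SIZE and CODE_POLY.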