Add XID_Start and XID_Continue properties to unicodectype.

13c3e380 · Martin v. Löwis · ff398c6f · 13c3e380 · 13c3e380 · 13c3e380
Commit 13c3e380 authored Aug 14, 2007 by Martin v. Löwis
4 changed files
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -205,6 +205,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
+# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
+# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
@@ -289,6 +291,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
+# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
+# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
@@ -1274,6 +1278,14 @@ PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
    Py_UNICODE ch 	/* Unicode character */
    );

+PyAPI_FUNC(int) _PyUnicode_IsXidStart(
+    Py_UNICODE ch 	/* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
+    Py_UNICODE ch 	/* Unicode character */
+    );
+
 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
    const Py_UNICODE ch 	/* Unicode character */
    );

--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -19,6 +19,8 @@
 #define SPACE_MASK 0x20
 #define TITLE_MASK 0x40
 #define UPPER_MASK 0x80
+#define XID_START_MASK 0x100
+#define XID_CONTINUE_MASK 0x200

 typedef struct {
    const Py_UNICODE upper;
@@ -98,6 +100,26 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
    return (ctype->flags & TITLE_MASK) != 0;
 }

+/* Returns 1 for Unicode characters having the XID_Start property, 0
+   otherwise. */
+
+int _PyUnicode_IsXidStart(Py_UNICODE ch)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    return (ctype->flags & XID_START_MASK) != 0;
+}
+
+/* Returns 1 for Unicode characters having the XID_Continue property,
+   0 otherwise. */
+
+int _PyUnicode_IsXidContinue(Py_UNICODE ch)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    return (ctype->flags & XID_CONTINUE_MASK) != 0;
+}
+
 /* Returns the integer decimal (0-9) for Unicode characters having
   this property, -1 otherwise. */


--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -34,6 +34,7 @@ UNIDATA_VERSION = "4.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

 old_versions = ["3.2.0"]

@@ -57,6 +58,8 @@ LINEBREAK_MASK = 0x10
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
+XID_START_MASK = 0x100
+XID_CONTINUE_MASK = 0x200

 def maketables(trace=0):

@@ -65,16 +68,18 @@ def maketables(trace=0):
    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version)
+                          EASTASIAN_WIDTH % version,
+                          DERIVED_CORE_PROPERTIES % version)

-    print(len(filter(None, unicode.table)), "characters")
+    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
-        print(len(filter(None, old_unicode.table)), "characters")
+                                  EASTASIAN_WIDTH % ("-"+version),
+                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
@@ -148,7 +153,7 @@ def makeunicodedata(unicode, trace):
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
-                         map(lambda s: int(s, 16), decomp)
+                         list(map(lambda s: int(s, 16), decomp))
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
@@ -353,6 +358,7 @@ def makeunicodetype(unicode, trace):
            # extract database properties
            category = record[2]
            bidirectional = record[4]
+            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
@@ -366,6 +372,10 @@ def makeunicodetype(unicode, trace):
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
+            if "XID_Start" in properties:
+                flags |= XID_START_MASK
+            if "XID_Continue" in properties:
+                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
@@ -447,7 +457,7 @@ def makeunicodename(unicode, trace):
            if name and name[0] != "<":
                names[char] = name + chr(0)

-    print(len(filter(lambda n: n is not None, names)), "distinct names")
+    print(len(list(filter(lambda n: n is not None, names))), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
@@ -470,10 +480,12 @@ def makeunicodename(unicode, trace):

    print(n, "words in text;", b, "bytes")

-    wordlist = words.items()
+    wordlist = list(words.items())

    # sort on falling frequency, then by name
-    def cmpwords((aword, alist),(bword, blist)):
+    def cmpwords(a,b):
+        aword, alist = a
+        bword, blist = b
        r = -cmp(len(alist),len(blist))
        if r:
            return r
@@ -526,7 +538,7 @@ def makeunicodename(unicode, trace):
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

-    lexicon = map(ord, lexicon)
+    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
@@ -660,11 +672,14 @@ def merge_old_version(version, new, old):
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
+                    elif k == 16:
+                        # derived property changes; not yet
+                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
-    new.changed.append((version, zip(bidir_changes, category_changes,
-                                     decimal_changes, numeric_changes),
+    new.changed.append((version, list(zip(bidir_changes, category_changes,
+                                     decimal_changes, numeric_changes)),
                        normalization_changes))


@@ -677,8 +692,14 @@ def merge_old_version(version, new, old):
 import sys

 class UnicodeData:
-
-    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+    # Record structure:
+    # [ID, name, category, combining, bidi, decomp,  (6)
+    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
+    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
+    #  derived-props] (17)
+
+    def __init__(self, filename, exclusions, eastasianwidth,
+                 derivedprops, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
@@ -742,6 +763,28 @@ class UnicodeData:
            if table[i] is not None:
                table[i].append(widths[i])

+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(set())
+        for s in open(derivedprops):
+            s = s.split('#', 1)[0].strip()
+            if not s:
+                continue
+
+            r, p = s.split(";")
+            r = r.strip()
+            p = p.strip()
+            if ".." in r:
+                first, last = [int(c, 16) for c in r.split('..')]
+                chars = range(first, last+1)
+            else:
+                chars = [int(r, 16)]
+            for char in chars:
+                if table[char]:
+                    # Some properties (e.g. Default_Ignorable_Code_Point)
+                    # apply to unassigned code points; ignore them
+                    table[char][-1].add(p)
+
    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)