Merged revisions 79494,79496 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (mar, 30 mar 2010) | 2 lines

  #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according
  to Unicode Standard Annex #14.
........
  r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (mar, 30 mar 2010) | 2 lines

  Highlight the change of behavior related to r79494.
  Now VT and FF are linebreaks.
........

Merged revisions 79494,79496 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (mar, 30 mar 2010) | 2 lines

  #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according
  to Unicode Standard Annex #14.
........
  r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (mar, 30 mar 2010) | 2 lines

  Highlight the change of behavior related to r79494.
  Now VT and FF are linebreaks.
........
806d8cf0 · Florent Xicluna · 364129ef · 806d8cf0 · 806d8cf0 · 806d8cf0
Commit 806d8cf0 authored Mar 30, 2010 by Florent Xicluna
5 changed files
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -25,7 +25,7 @@ class UnicodeMethodsTest(unittest.TestCase):
    def test_method_checksum(self):
        h = hashlib.sha1()
-        for i in range(65536):
+        for i in range(0x10000):
            char = chr(i)
            data = [
                # Predicates (single char)
@@ -284,6 +284,17 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")
+    def test_linebreak_7643(self):
+        for i in range(0x10000):
+            lines = (chr(i) + 'A').splitlines()
+            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
+                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
+                self.assertEqual(len(lines), 2,
+                                 r"\u%.4x should be a linebreak" % i)
+            else:
+                self.assertEqual(len(lines), 1,
+                                 r"\u%.4x should not be a linebreak" % i)
 def test_main():
    test.support.run_unittest(
        UnicodeMiscTest,

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -293,6 +293,11 @@ C-API
 Library
 -------
+- Backwards incompatible change: Unicode codepoints line tabulation (0x0B) and
+  form feed (0x0C) are now considered linebreaks, as specified in Unicode
+  Standard Annex #14.  See issue #7643.
+  http://www.unicode.org/reports/tr14/
 - Comparisons using one of <, <=, >, >= between a complex instance and
  a Fractions instance now raise TypeError instead of returning
  True/False.  This makes Fraction <=> complex comparisons consistent with

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -126,9 +126,9 @@ static const char unicode_default_encoding[] = "utf-8";
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
-/*     case 0x0009: * HORIZONTAL TABULATION */
+/*     case 0x0009: * CHARACTER TABULATION */
 /*     case 0x000A: * LINE FEED */
-/*     case 0x000B: * VERTICAL TABULATION */
+/*     case 0x000B: * LINE TABULATION */
 /*     case 0x000C: * FORM FEED */
 /*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
@@ -163,8 +163,10 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
 static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
+/*         0x000B, * LINE TABULATION */
+/*         0x000C, * FORM FEED */
 /*         0x000D, * CARRIAGE RETURN */
-    0, 0, 1, 0, 0, 1, 0, 0,
+    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x001C, * FILE SEPARATOR */
 /*         0x001D, * GROUP SEPARATOR */

--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -694,7 +694,7 @@ static unsigned char index1[] = {
 };
 static unsigned char index2[] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 3, 3, 3, 2, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 5, 5, 5, 5, 5, 5, 16, 16, 16, 16, 
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
@@ -3395,13 +3395,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
 #endif
 }
-/* Returns 1 for Unicode characters having the category 'Zl',
+/* Returns 1 for Unicode characters having the line break
- * 'Zp' or type 'B', 0 otherwise.
+ * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
+ * type 'B', 0 otherwise.
 */
 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
 {
    switch (ch) {
    case 0x000A:
+    case 0x000B:
+    case 0x000C:
    case 0x000D:
    case 0x001C:
    case 0x001D:

--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -38,6 +38,7 @@ EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
 old_versions = ["3.2.0"]
@@ -52,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
@@ -77,7 +80,8 @@ def maketables(trace=0):
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version)
+                          DERIVEDNORMALIZATION_PROPS % version,
+                          LINE_BREAK % version)
    print(len(list(filter(None, unicode.table))), "characters")
@@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace):
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
-            if category == "Zl" or bidirectional == "B":
+            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace):
    print(file=fp)
    # Generate code for _PyUnicode_IsLinebreak()
-    print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
+    print("/* Returns 1 for Unicode characters having the line break", file=fp)
-    print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
+    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
+    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
    print('{', file=fp)
@@ -826,7 +831,8 @@ class UnicodeData:
    #  derived-props] (17)
    def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, expand=1):
+                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
@@ -912,6 +918,19 @@ class UnicodeData:
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)
+        if linebreakprops:
+            for s in open(linebreakprops):
+                s = s.partition('#')[0]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                    continue
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    table[char][-1].add('Line_Break')
        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()