Commit 22b24380 authored by Florent Xicluna

#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to...

#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.
parent e6410c53
...@@ -24,7 +24,7 @@ class UnicodeMethodsTest(unittest.TestCase): ...@@ -24,7 +24,7 @@ class UnicodeMethodsTest(unittest.TestCase):
def test_method_checksum(self): def test_method_checksum(self):
h = hashlib.sha1() h = hashlib.sha1()
for i in range(65536): for i in range(0x10000):
char = unichr(i) char = unichr(i)
data = [ data = [
# Predicates (single char) # Predicates (single char)
...@@ -282,6 +282,17 @@ class UnicodeMiscTest(UnicodeDatabaseTest): ...@@ -282,6 +282,17 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
self.assertEqual(u"\u01c5".title(), u"\u01c5") self.assertEqual(u"\u01c5".title(), u"\u01c5")
self.assertEqual(u"\u01c6".title(), u"\u01c5") self.assertEqual(u"\u01c6".title(), u"\u01c5")
def test_linebreak_7643(self):
    """Check which BMP code points splitlines() treats as line breaks.

    Each code point below 0x10000 is followed by u'A' and split with
    splitlines(): the code points listed in ``breaking`` must yield two
    lines, every other code point must yield exactly one (issue #7643).
    """
    # Code points expected to terminate a line; VT (0x0b) and FF (0x0c)
    # are the two this issue is about.
    breaking = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
    for codepoint in range(0x10000):
        pieces = (unichr(codepoint) + u'A').splitlines()
        if codepoint not in breaking:
            self.assertEqual(len(pieces), 1,
                             r"\u%.4x should not be a linebreak" % codepoint)
            continue
        self.assertEqual(len(pieces), 2,
                         r"\u%.4x should be a linebreak" % codepoint)
def test_main(): def test_main():
test.test_support.run_unittest( test.test_support.run_unittest(
UnicodeMiscTest, UnicodeMiscTest,
......
...@@ -32,6 +32,10 @@ Core and Builtins ...@@ -32,6 +32,10 @@ Core and Builtins
Library Library
------- -------
- Issue #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks
according to Unicode Standard Annex #14.
http://www.unicode.org/reports/tr14/
- Comparisons using one of <, <=, >, >= between a complex instance and - Comparisons using one of <, <=, >, >= between a complex instance and
a Fractions instance now raise TypeError instead of returning a Fractions instance now raise TypeError instead of returning
True/False. This makes Fraction <=> complex comparisons consistent with True/False. This makes Fraction <=> complex comparisons consistent with
......
...@@ -115,9 +115,9 @@ static char unicode_default_encoding[100]; ...@@ -115,9 +115,9 @@ static char unicode_default_encoding[100];
/* Fast detection of the most frequent whitespace characters */ /* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = { const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* case 0x0009: * HORIZONTAL TABULATION */ /* case 0x0009: * CHARACTER TABULATION */
/* case 0x000A: * LINE FEED */ /* case 0x000A: * LINE FEED */
/* case 0x000B: * VERTICAL TABULATION */ /* case 0x000B: * LINE TABULATION */
/* case 0x000C: * FORM FEED */ /* case 0x000C: * FORM FEED */
/* case 0x000D: * CARRIAGE RETURN */ /* case 0x000D: * CARRIAGE RETURN */
0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
...@@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = { ...@@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = {
static unsigned char ascii_linebreak[] = { static unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x000A, * LINE FEED */ /* 0x000A, * LINE FEED */
/* 0x000B, * LINE TABULATION */
/* 0x000C, * FORM FEED */
/* 0x000D, * CARRIAGE RETURN */ /* 0x000D, * CARRIAGE RETURN */
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x001C, * FILE SEPARATOR */ /* 0x001C, * FILE SEPARATOR */
/* 0x001D, * GROUP SEPARATOR */ /* 0x001D, * GROUP SEPARATOR */
......
...@@ -661,7 +661,7 @@ static unsigned char index1[] = { ...@@ -661,7 +661,7 @@ static unsigned char index1[] = {
}; };
static unsigned char index2[] = { static unsigned char index2[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
...@@ -3313,13 +3313,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) ...@@ -3313,13 +3313,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
#endif #endif
} }
/* Returns 1 for Unicode characters having the category 'Zl', /* Returns 1 for Unicode characters having the line break
* 'Zp' or type 'B', 0 otherwise. * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
* type 'B', 0 otherwise.
*/ */
int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
{ {
switch (ch) { switch (ch) {
case 0x000A: case 0x000A:
case 0x000B:
case 0x000C:
case 0x000D: case 0x000D:
case 0x001C: case 0x001C:
case 0x001D: case 0x001D:
......
...@@ -36,6 +36,7 @@ COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" ...@@ -36,6 +36,7 @@ COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt" UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
old_versions = ["3.2.0"] old_versions = ["3.2.0"]
...@@ -50,6 +51,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", ...@@ -50,6 +51,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
# note: should match definitions in Objects/unicodectype.c # note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01 ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02 DECIMAL_MASK = 0x02
...@@ -71,7 +74,8 @@ def maketables(trace=0): ...@@ -71,7 +74,8 @@ def maketables(trace=0):
COMPOSITION_EXCLUSIONS % version, COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version, EASTASIAN_WIDTH % version,
UNIHAN % version, UNIHAN % version,
DERIVEDNORMALIZATION_PROPS % version) DERIVEDNORMALIZATION_PROPS % version,
LINE_BREAK % version)
print len(filter(None, unicode.table)), "characters" print len(filter(None, unicode.table)), "characters"
...@@ -113,7 +117,7 @@ def makeunicodedata(unicode, trace): ...@@ -113,7 +117,7 @@ def makeunicodedata(unicode, trace):
bidirectional = BIDIRECTIONAL_NAMES.index(record[4]) bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y" mirrored = record[9] == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15]) eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
normalizationquickcheck = record[16] normalizationquickcheck = record[17]
item = ( item = (
category, combining, bidirectional, mirrored, eastasianwidth, category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck normalizationquickcheck
...@@ -365,13 +369,14 @@ def makeunicodetype(unicode, trace): ...@@ -365,13 +369,14 @@ def makeunicodetype(unicode, trace):
# extract database properties # extract database properties
category = record[2] category = record[2]
bidirectional = record[4] bidirectional = record[4]
properties = record[16]
flags = 0 flags = 0
delta = True delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]: if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK flags |= ALPHA_MASK
if category == "Ll": if category == "Ll":
flags |= LOWER_MASK flags |= LOWER_MASK
if category == "Zl" or bidirectional == "B": if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK flags |= LINEBREAK_MASK
linebreaks.append(char) linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"): if category == "Zs" or bidirectional in ("WS", "B", "S"):
...@@ -524,8 +529,9 @@ def makeunicodetype(unicode, trace): ...@@ -524,8 +529,9 @@ def makeunicodetype(unicode, trace):
print >>fp print >>fp
# Generate code for _PyUnicode_IsLinebreak() # Generate code for _PyUnicode_IsLinebreak()
print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl'," print >>fp, "/* Returns 1 for Unicode characters having the line break"
print >>fp, " * 'Zp' or type 'B', 0 otherwise." print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
print >>fp, " * type 'B', 0 otherwise."
print >>fp, " */" print >>fp, " */"
print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)' print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
print >>fp, '{' print >>fp, '{'
...@@ -787,6 +793,9 @@ def merge_old_version(version, new, old): ...@@ -787,6 +793,9 @@ def merge_old_version(version, new, old):
elif k == 14: elif k == 14:
# change to simple titlecase mapping; ignore # change to simple titlecase mapping; ignore
pass pass
elif k == 16:
# change to properties; not yet
pass
else: else:
class Difference(Exception):pass class Difference(Exception):pass
raise Difference, (hex(i), k, old.table[i], new.table[i]) raise Difference, (hex(i), k, old.table[i], new.table[i])
...@@ -803,9 +812,15 @@ def merge_old_version(version, new, old): ...@@ -803,9 +812,15 @@ def merge_old_version(version, new, old):
# load a unicode-data file from disk # load a unicode-data file from disk
class UnicodeData: class UnicodeData:
# Record structure:
# [ID, name, category, combining, bidi, decomp, (6)
# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# properties] (17)
def __init__(self, filename, exclusions, eastasianwidth, unihan, def __init__(self, filename, exclusions, eastasianwidth, unihan,
derivednormalizationprops=None, expand=1): derivednormalizationprops=None, linebreakprops=None,
expand=1):
self.changed = [] self.changed = []
file = open(filename) file = open(filename)
table = [None] * 0x110000 table = [None] * 0x110000
...@@ -868,6 +883,23 @@ class UnicodeData: ...@@ -868,6 +883,23 @@ class UnicodeData:
for i in range(0, 0x110000): for i in range(0, 0x110000):
if table[i] is not None: if table[i] is not None:
table[i].append(widths[i]) table[i].append(widths[i])
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(set())
if linebreakprops:
for s in open(linebreakprops):
s = s.partition('#')[0]
s = [i.strip() for i in s.split(';')]
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
continue
if '..' not in s[0]:
first = last = int(s[0], 16)
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
table[char][-1].add('Line_Break')
if derivednormalizationprops: if derivednormalizationprops:
quickchecks = [0] * 0x110000 # default is Yes quickchecks = [0] * 0x110000 # default is Yes
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment