Commit 150016fd authored by Victor Stinner

Issue #8559: improve unicode support of (gdb) libpython.py

 * Escape non-printable characters (printability is checked against
   locale.getpreferredencoding())
 * Fix support of surrogate pairs
 * test_gdb.py: use ascii() instead of repr() in gdb program arguments to avoid
   encoding issues
 * Fix test_strings() of test_gdb.py for encodings other than UTF-8
   (e.g. ASCII)
parent 06710a84
...@@ -8,6 +8,7 @@ import re ...@@ -8,6 +8,7 @@ import re
import subprocess import subprocess
import sys import sys
import unittest import unittest
import locale
from test.support import run_unittest, findfile from test.support import run_unittest, findfile
...@@ -177,7 +178,7 @@ class PrettyPrintTests(DebuggerTests): ...@@ -177,7 +178,7 @@ class PrettyPrintTests(DebuggerTests):
def assertGdbRepr(self, val, exp_repr=None, cmds_after_breakpoint=None): def assertGdbRepr(self, val, exp_repr=None, cmds_after_breakpoint=None):
# Ensure that gdb's rendering of the value in a debugged process # Ensure that gdb's rendering of the value in a debugged process
# matches repr(value) in this process: # matches repr(value) in this process:
gdb_repr, gdb_output = self.get_gdb_repr('id(' + repr(val) + ')', gdb_repr, gdb_output = self.get_gdb_repr('id(' + ascii(val) + ')',
cmds_after_breakpoint) cmds_after_breakpoint)
if not exp_repr: if not exp_repr:
exp_repr = repr(val) exp_repr = repr(val)
...@@ -226,31 +227,35 @@ class PrettyPrintTests(DebuggerTests): ...@@ -226,31 +227,35 @@ class PrettyPrintTests(DebuggerTests):
def test_strings(self): def test_strings(self):
'Verify the pretty-printing of unicode strings' 'Verify the pretty-printing of unicode strings'
encoding = locale.getpreferredencoding()
def check_repr(text):
try:
text.encode(encoding)
printable = True
except UnicodeEncodeError:
self.assertGdbRepr(text, ascii(text))
else:
self.assertGdbRepr(text)
self.assertGdbRepr('') self.assertGdbRepr('')
self.assertGdbRepr('And now for something hopefully the same') self.assertGdbRepr('And now for something hopefully the same')
self.assertGdbRepr('string with embedded NUL here \0 and then some more text') self.assertGdbRepr('string with embedded NUL here \0 and then some more text')
# Test printing a single character: # Test printing a single character:
# U+2620 SKULL AND CROSSBONES # U+2620 SKULL AND CROSSBONES
self.assertGdbRepr('\u2620') check_repr('\u2620')
# Test printing a Japanese unicode string # Test printing a Japanese unicode string
# (I believe this reads "mojibake", using 3 characters from the CJK # (I believe this reads "mojibake", using 3 characters from the CJK
# Unified Ideographs area, followed by U+3051 HIRAGANA LETTER KE) # Unified Ideographs area, followed by U+3051 HIRAGANA LETTER KE)
self.assertGdbRepr('\u6587\u5b57\u5316\u3051') check_repr('\u6587\u5b57\u5316\u3051')
# Test a character outside the BMP: # Test a character outside the BMP:
# U+1D121 MUSICAL SYMBOL C CLEF # U+1D121 MUSICAL SYMBOL C CLEF
# This is: # This is:
# UTF-8: 0xF0 0x9D 0x84 0xA1 # UTF-8: 0xF0 0x9D 0x84 0xA1
# UTF-16: 0xD834 0xDD21 # UTF-16: 0xD834 0xDD21
if sys.maxunicode == 0x10FFFF: check_repr(chr(0x1D121))
# wide unicode:
self.assertGdbRepr(chr(0x1D121))
else:
# narrow unicode:
self.assertGdbRepr(chr(0x1D121),
"'\\U0000d834\\U0000dd21'")
def test_tuples(self): def test_tuples(self):
'Verify the pretty-printing of tuples' 'Verify the pretty-printing of tuples'
......
...@@ -42,6 +42,7 @@ The module also extends gdb with some python-specific commands. ...@@ -42,6 +42,7 @@ The module also extends gdb with some python-specific commands.
''' '''
from __future__ import with_statement from __future__ import with_statement
import gdb import gdb
import locale
# Look up the gdb.Type for some standard types: # Look up the gdb.Type for some standard types:
_type_char_ptr = gdb.lookup_type('char').pointer() # char* _type_char_ptr = gdb.lookup_type('char').pointer() # char*
...@@ -69,6 +70,7 @@ MAX_OUTPUT_LEN=1024 ...@@ -69,6 +70,7 @@ MAX_OUTPUT_LEN=1024
hexdigits = "0123456789abcdef" hexdigits = "0123456789abcdef"
ENCODING = locale.getpreferredencoding()
class NullPyObjectPtr(RuntimeError): class NullPyObjectPtr(RuntimeError):
pass pass
...@@ -1128,52 +1130,67 @@ class PyUnicodeObjectPtr(PyObjectPtr): ...@@ -1128,52 +1130,67 @@ class PyUnicodeObjectPtr(PyObjectPtr):
# Non-ASCII characters # Non-ASCII characters
else: else:
ucs = ch; ucs = ch
orig_ucs = None
if self.char_width == 2: if self.char_width() == 2:
ch2 = 0
# Get code point from surrogate pair # Get code point from surrogate pair
if i < len(proxy): if (i < len(proxy)
and 0xD800 <= ord(ch) < 0xDC00 \
and 0xDC00 <= ord(proxy[i]) <= 0xDFFF):
ch2 = proxy[i] ch2 = proxy[i]
if (ord(ch) >= 0xD800 and ord(ch) < 0xDC00 code = (ord(ch) & 0x03FF) << 10
and ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF): code |= ord(ch2) & 0x03FF
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; code += 0x00010000
orig_ucs = ucs
ucs = unichr(code)
i += 1 i += 1
else:
ch2 = None
printable = _unichr_is_printable(ucs)
if printable:
try:
ucs.encode(ENCODING)
except UnicodeEncodeError:
printable = False
if orig_ucs is not None:
ucs = orig_ucs
i -= 1
# Map Unicode whitespace and control characters # Map Unicode whitespace and control characters
# (categories Z* and C* except ASCII space) # (categories Z* and C* except ASCII space)
if not _unichr_is_printable(ucs): if not printable:
# Unfortuately, Python 2's unicode type doesn't seem # Unfortuately, Python 2's unicode type doesn't seem
# to expose the "isprintable" method # to expose the "isprintable" method
code = ord(ucs)
# Map 8-bit characters to '\\xhh' # Map 8-bit characters to '\\xhh'
if ucs <= 0xff: if code <= 0xff:
out.write('\\x') out.write('\\x')
out.write(hexdigits[(ord(ucs) >> 4) & 0x000F]) out.write(hexdigits[(code >> 4) & 0x000F])
out.write(hexdigits[ord(ucs) & 0x000F]) out.write(hexdigits[code & 0x000F])
# Map 21-bit characters to '\U00xxxxxx' # Map 21-bit characters to '\U00xxxxxx'
elif ucs >= 0x10000: elif code >= 0x10000:
out.write('\\U') out.write('\\U')
out.write(hexdigits[(ord(ucs) >> 28) & 0x0000000F]) out.write(hexdigits[(code >> 28) & 0x0000000F])
out.write(hexdigits[(ord(ucs) >> 24) & 0x0000000F]) out.write(hexdigits[(code >> 24) & 0x0000000F])
out.write(hexdigits[(ord(ucs) >> 20) & 0x0000000F]) out.write(hexdigits[(code >> 20) & 0x0000000F])
out.write(hexdigits[(ord(ucs) >> 16) & 0x0000000F]) out.write(hexdigits[(code >> 16) & 0x0000000F])
out.write(hexdigits[(ord(ucs) >> 12) & 0x0000000F]) out.write(hexdigits[(code >> 12) & 0x0000000F])
out.write(hexdigits[(ord(ucs) >> 8) & 0x0000000F]) out.write(hexdigits[(code >> 8) & 0x0000000F])
out.write(hexdigits[(ord(ucs) >> 4) & 0x0000000F]) out.write(hexdigits[(code >> 4) & 0x0000000F])
out.write(hexdigits[ord(ucs) & 0x0000000F]) out.write(hexdigits[code & 0x0000000F])
# Map 16-bit characters to '\uxxxx' # Map 16-bit characters to '\uxxxx'
else: else:
out.write('\\u') out.write('\\u')
out.write(hexdigits[(ord(ucs) >> 12) & 0x000F]) out.write(hexdigits[(code >> 12) & 0x000F])
out.write(hexdigits[(ord(ucs) >> 8) & 0x000F]) out.write(hexdigits[(code >> 8) & 0x000F])
out.write(hexdigits[(ord(ucs) >> 4) & 0x000F]) out.write(hexdigits[(code >> 4) & 0x000F])
out.write(hexdigits[ord(ucs) & 0x000F]) out.write(hexdigits[code & 0x000F])
else: else:
# Copy characters as-is # Copy characters as-is
out.write(ch) out.write(ch)
if self.char_width == 2: if self.char_width() == 2 and (ch2 is not None):
if ord(ucs) >= 0x10000:
out.write(ch2) out.write(ch2)
out.write(quote) out.write(quote)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment