Issue #8559: improve unicode support of (gdb) libpython.py

* Escape non-printable characters (use locale.getpreferredencoding()) * Fix support of surrogate pairs * test_gdb.py: use ascii() instead of repr() in gdb program arguments to avoid encoding issues * Fix test_strings() of test_gdb.py for encodings other than UTF-8 (e.g. ASCII)

Issue #8559: improve unicode support of (gdb) libpython.py
* Escape non-printable characters (use locale.getpreferredencoding()) * Fix support of surrogate pairs * test_gdb.py: use ascii() instead of repr() in gdb program arguments to avoid encoding issues * Fix test_strings() of test_gdb.py for encodings other than UTF-8 (e.g. ASCII)
150016fd · Victor Stinner · 06710a84 · 150016fd · 150016fd
Commit 150016fd authored May 19, 2010 by Victor Stinner
Show whitespace changes
Inline Side-by-side

Showing with 61 additions and 39 deletions

Lib/test/test_gdb.py Lib/test/test_gdb.py +15 -10

Tools/gdb/libpython.py Tools/gdb/libpython.py +46 -29

No files found.
--- a/Lib/test/test_gdb.py
+++ b/Lib/test/test_gdb.py
@@ -8,6 +8,7 @@ import re
 import subprocess
 import sys
 import unittest
+import locale
 from test.support import run_unittest, findfile
@@ -177,7 +178,7 @@ class PrettyPrintTests(DebuggerTests):
    def assertGdbRepr(self, val, exp_repr=None, cmds_after_breakpoint=None):
        # Ensure that gdb's rendering of the value in a debugged process
        # matches repr(value) in this process:
-        gdb_repr, gdb_output = self.get_gdb_repr('id(' + repr(val) + ')',
+        gdb_repr, gdb_output = self.get_gdb_repr('id(' + ascii(val) + ')',
                                                 cmds_after_breakpoint)
        if not exp_repr:
            exp_repr = repr(val)
@@ -226,31 +227,35 @@ class PrettyPrintTests(DebuggerTests):
    def test_strings(self):
        'Verify the pretty-printing of unicode strings'
+        encoding = locale.getpreferredencoding()
+        def check_repr(text):
+            try:
+                text.encode(encoding)
+                printable = True
+            except UnicodeEncodeError:
+                self.assertGdbRepr(text, ascii(text))
+            else:
+                self.assertGdbRepr(text)
        self.assertGdbRepr('')
        self.assertGdbRepr('And now for something hopefully the same')
        self.assertGdbRepr('string with embedded NUL here \0 and then some more text')
        # Test printing a single character:
        #    U+2620 SKULL AND CROSSBONES
-        self.assertGdbRepr('\u2620')
+        check_repr('\u2620')
        # Test printing a Japanese unicode string
        # (I believe this reads "mojibake", using 3 characters from the CJK
        # Unified Ideographs area, followed by U+3051 HIRAGANA LETTER KE)
-        self.assertGdbRepr('\u6587\u5b57\u5316\u3051')
+        check_repr('\u6587\u5b57\u5316\u3051')
        # Test a character outside the BMP:
        #    U+1D121 MUSICAL SYMBOL C CLEF
        # This is:
        # UTF-8: 0xF0 0x9D 0x84 0xA1
        # UTF-16: 0xD834 0xDD21
-        if sys.maxunicode == 0x10FFFF:
+        check_repr(chr(0x1D121))
-            # wide unicode:
-            self.assertGdbRepr(chr(0x1D121))
-        else:
-            # narrow unicode:
-            self.assertGdbRepr(chr(0x1D121),
-                               "'\\U0000d834\\U0000dd21'")
    def test_tuples(self):
        'Verify the pretty-printing of tuples'

--- a/Tools/gdb/libpython.py
+++ b/Tools/gdb/libpython.py
@@ -42,6 +42,7 @@ The module also extends gdb with some python-specific commands.
 '''
 from __future__ import with_statement
 import gdb
+import locale
 # Look up the gdb.Type for some standard types:
 _type_char_ptr = gdb.lookup_type('char').pointer() # char*
@@ -69,6 +70,7 @@ MAX_OUTPUT_LEN=1024
 hexdigits = "0123456789abcdef"
+ENCODING = locale.getpreferredencoding()
 class NullPyObjectPtr(RuntimeError):
    pass
@@ -1128,52 +1130,67 @@ class PyUnicodeObjectPtr(PyObjectPtr):
            # Non-ASCII characters
            else:
-                ucs = ch;
+                ucs = ch
+                orig_ucs = None
-                if self.char_width == 2:
+                if self.char_width() == 2:
-                    ch2 = 0
                    # Get code point from surrogate pair
-                    if i < len(proxy):
+                    if (i < len(proxy)
+                    and 0xD800 <= ord(ch) < 0xDC00 \
+                    and 0xDC00 <= ord(proxy[i]) <= 0xDFFF):
                        ch2 = proxy[i]
-                        if (ord(ch) >= 0xD800 and ord(ch) < 0xDC00
+                        code = (ord(ch) & 0x03FF) << 10
-                            and ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF):
+                        code |= ord(ch2) & 0x03FF
-                            ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+                        code += 0x00010000
+                        orig_ucs = ucs
+                        ucs = unichr(code)
                        i += 1
+                    else:
+                        ch2 = None
+                printable = _unichr_is_printable(ucs)
+                if printable:
+                    try:
+                        ucs.encode(ENCODING)
+                    except UnicodeEncodeError:
+                        printable = False
+                        if orig_ucs is not None:
+                            ucs = orig_ucs
+                            i -= 1
                # Map Unicode whitespace and control characters
                # (categories Z* and C* except ASCII space)
-                if not _unichr_is_printable(ucs):
+                if not printable:
                     # Unfortunately, Python 2's unicode type doesn't seem
                    # to expose the "isprintable" method
+                    code = ord(ucs)
                    # Map 8-bit characters to '\\xhh'
-                    if ucs <= 0xff:
+                    if code <= 0xff:
                        out.write('\\x')
-                        out.write(hexdigits[(ord(ucs) >> 4) & 0x000F])
+                        out.write(hexdigits[(code >> 4) & 0x000F])
-                        out.write(hexdigits[ord(ucs) & 0x000F])
+                        out.write(hexdigits[code & 0x000F])
                    # Map 21-bit characters to '\U00xxxxxx'
-                    elif ucs >= 0x10000:
+                    elif code >= 0x10000:
                        out.write('\\U')
-                        out.write(hexdigits[(ord(ucs) >> 28) & 0x0000000F])
+                        out.write(hexdigits[(code >> 28) & 0x0000000F])
-                        out.write(hexdigits[(ord(ucs) >> 24) & 0x0000000F])
+                        out.write(hexdigits[(code >> 24) & 0x0000000F])
-                        out.write(hexdigits[(ord(ucs) >> 20) & 0x0000000F])
+                        out.write(hexdigits[(code >> 20) & 0x0000000F])
-                        out.write(hexdigits[(ord(ucs) >> 16) & 0x0000000F])
+                        out.write(hexdigits[(code >> 16) & 0x0000000F])
-                        out.write(hexdigits[(ord(ucs) >> 12) & 0x0000000F])
+                        out.write(hexdigits[(code >> 12) & 0x0000000F])
-                        out.write(hexdigits[(ord(ucs) >> 8) & 0x0000000F])
+                        out.write(hexdigits[(code >> 8) & 0x0000000F])
-                        out.write(hexdigits[(ord(ucs) >> 4) & 0x0000000F])
+                        out.write(hexdigits[(code >> 4) & 0x0000000F])
-                        out.write(hexdigits[ord(ucs) & 0x0000000F])
+                        out.write(hexdigits[code & 0x0000000F])
                    # Map 16-bit characters to '\uxxxx'
                    else:
                        out.write('\\u')
-                        out.write(hexdigits[(ord(ucs) >> 12) & 0x000F])
+                        out.write(hexdigits[(code >> 12) & 0x000F])
-                        out.write(hexdigits[(ord(ucs) >> 8) & 0x000F])
+                        out.write(hexdigits[(code >> 8) & 0x000F])
-                        out.write(hexdigits[(ord(ucs) >> 4) & 0x000F])
+                        out.write(hexdigits[(code >> 4) & 0x000F])
-                        out.write(hexdigits[ord(ucs) & 0x000F])
+                        out.write(hexdigits[code & 0x000F])
                else:
                    # Copy characters as-is
                    out.write(ch)
-                    if self.char_width == 2:
+                    if self.char_width() == 2 and (ch2 is not None):
-                        if ord(ucs) >= 0x10000:
                        out.write(ch2)
        out.write(quote)