Additional test and documentation for the unicode() changes.

This patch should also be applied to the 2.2b1 trunk.

Additional test and documentation for the unicode() changes.
This patch should also be applied to the 2.2b1 trunk.
b5507ecd · Marc-André Lemburg · f6fb171c · b5507ecd · b5507ecd · b5507ecd
Commit b5507ecd authored Oct 19, 2001 by Marc-André Lemburg
5 changed files
--- a/Doc/lib/libfuncs.tex
+++ b/Doc/lib/libfuncs.tex
@@ -758,19 +758,33 @@ def my_import(name):
  \versionadded{2.0}
 \end{funcdesc}

-\begin{funcdesc}{unicode}{string\optional{, encoding\optional{, errors}}}
-  Create a Unicode string from an 8-bit string \var{string} using the
-  codec for \var{encoding}.  The \var{encoding} parameter is a string
-  giving the name of an encoding.  Error handling is done according to
-  \var{errors}; this specifies the treatment of characters which are
-  invalid in the input encoding.  If \var{errors} is \code{'strict'}
-  (the default), a \exception{ValueError} is raised on errors, while a
-  value of \code{'ignore'} causes errors to be silently ignored, and a
-  value of \code{'replace'} causes the official Unicode replacement
-  character, \code{U+FFFD}, to be used to replace input characters
-  which cannot be decoded.  The default behavior is to decode UTF-8 in
-  strict mode, meaning that encoding errors raise
-  \exception{ValueError}.  See also the \refmodule{codecs} module.
+\begin{funcdesc}{unicode}{object\optional{, encoding\optional{, errors}}}
+  Return the Unicode string version of \var{object} using one of the
+  following modes:
+
+  If \var{encoding} and/or \var{errors} are given, \code{unicode()}
+  will decode the object which can either be an 8-bit string or a
+  character buffer using the codec for \var{encoding}. The
+  \var{encoding} parameter is a string giving the name of an encoding.
+  Error handling is done according to \var{errors}; this specifies the
+  treatment of characters which are invalid in the input encoding.  If
+  \var{errors} is \code{'strict'} (the default), a
+  \exception{ValueError} is raised on errors, while a value of
+  \code{'ignore'} causes errors to be silently ignored, and a value of
+  \code{'replace'} causes the official Unicode replacement character,
+  \code{U+FFFD}, to be used to replace input characters which cannot
+  be decoded.  See also the \refmodule{codecs} module.
+
+  If no optional parameters are given, \code{unicode()} will mimic the
+  behaviour of \code{str()} except that it returns Unicode strings
+  instead of 8-bit strings. More precisely, if \var{object} is a
+  Unicode string or subclass it will return a Unicode string without
+  any additional decoding applied. For objects which provide a
+  \code{__unicode__} method, it will call this method without
+  arguments to create a Unicode string. For all other objects, the
+  8-bit string version or representation is requested and then
+  converted to a Unicode string using the codec for the default
+  encoding in \code{'strict'} mode.
  \versionadded{2.0}
 \end{funcdesc}


--- a/Lib/test/output/test_unicode
+++ b/Lib/test/output/test_unicode
@@ -2,6 +2,7 @@ test_unicode
 Testing Unicode comparisons... done.
 Testing Unicode contains method... done.
 Testing Unicode formatting strings... done.
+Testing builtin unicode()... done.
 Testing builtin codecs... done.
 Testing standard mapping codecs... 0-127... 128-255... done.
 Testing Unicode string concatenation... done.
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -389,6 +389,67 @@ verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
 verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103   abc')
 print 'done.'

+print 'Testing builtin unicode()...',
+
+# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
+
+verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
+
+class UnicodeSubclass(unicode):
+    pass
+
+verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
+       == u'unicode subclass becomes unicode')
+
+verify(unicode('strings are converted to unicode')
+       == u'strings are converted to unicode')
+
+class UnicodeCompat:
+    def __init__(self, x):
+        self.x = x
+    def __unicode__(self):
+        return self.x
+
+verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
+       == u'__unicode__ compatible objects are recognized')
+
+class StringCompat:
+    def __init__(self, x):
+        self.x = x
+    def __str__(self):
+        return self.x
+
+verify(unicode(StringCompat('__str__ compatible objects are recognized'))
+       == u'__str__ compatible objects are recognized')
+
+# unicode(obj) is compatible with str():
+
+o = StringCompat('unicode(obj) is compatible to str()')
+verify(unicode(o) == u'unicode(obj) is compatible to str()')
+verify(str(o) == 'unicode(obj) is compatible to str()')
+
+for obj in (123, 123.45, 123L):
+    verify(unicode(obj) == unicode(str(obj)))
+
+# unicode(obj, encoding, error) tests (this maps to
+# PyUnicode_FromEncodedObject() at C level)
+
+try:
+    unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
+except TypeError:
+    pass
+else:
+    raise TestFailed, "decoding unicode should NOT be supported"
+
+verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
+       == u'strings are decoded to unicode')
+
+verify(unicode(buffer('character buffers are decoded to unicode'),
+               'utf-8', 'strict')
+       == u'character buffers are decoded to unicode')
+
+print 'done.'
+
 # Test builtin codecs
 print 'Testing builtin codecs...',

@@ -437,32 +498,11 @@ verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
 # * strict decoding testing for all of the
 #   UTF8_ERROR cases in PyUnicode_DecodeUTF8

-
-
 verify(unicode('hello','ascii') == u'hello')
 verify(unicode('hello','utf-8') == u'hello')
 verify(unicode('hello','utf8') == u'hello')
 verify(unicode('hello','latin-1') == u'hello')

-# Compatibility to str():
-class String:
-    x = ''
-    def __str__(self):
-        return self.x
-
-o = String()
-
-o.x = 'abc'
-verify(unicode(o) == u'abc')
-verify(str(o) == 'abc')
-
-o.x = u'abc'
-verify(unicode(o) == u'abc')
-verify(str(o) == 'abc')
-
-for obj in (123, 123.45, 123L):
-    verify(unicode(obj) == unicode(str(obj)))
-
 # Error handling
 try:
    u'Andr\202 x'.encode('ascii')

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -44,7 +44,7 @@ Core and builtins
 - unicode(obj) now behaves more like str(obj), accepting arbitrary
  objects, and calling a __unicode__ method if it exists.
  unicode(obj, encoding) and unicode(obj, encoding, errors) still
-  require an 8-bit string argument.
+  require an 8-bit string or character buffer argument.

 - isinstance() now allows any object as the first argument and a
  class, a type or something with a __bases__ tuple attribute for the

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -426,8 +426,9 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,

 #if 0
    /* For b/w compatibility we also accept Unicode objects provided
-       that no encodings is given and then redirect to PyObject_Unicode() 
-       which then applies the additional logic for Unicode subclasses.
+       that no encoding is given and then redirect to
+       PyObject_Unicode() which then applies the additional logic for
+       Unicode subclasses.

       NOTE: This API should really only be used for object which
             represent *encoded* Unicode !