Issue20284: Implement PEP461

b95b5615 · Ethan Furman · 8861502e · b95b5615 · b95b5615 · b95b5615
Commit b95b5615 authored Jan 23, 2015 by Ethan Furman
10 changed files
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -3057,6 +3057,197 @@ place, and instead produce new objects.
      always produces a new object, even if no changes were made.


+.. _bytes-formatting:
+
+``printf``-style Bytes Formatting
+----------------------------------
+
+.. index::
+   single: formatting, bytes (%)
+   single: formatting, bytearray (%)
+   single: interpolation, bytes (%)
+   single: interpolation, bytearray (%)
+   single: bytes; formatting
+   single: bytearray; formatting
+   single: bytes; interpolation
+   single: bytearray; interpolation
+   single: printf-style formatting
+   single: sprintf-style formatting
+   single: % formatting
+   single: % interpolation
+
+.. note::
+
+   The formatting operations described here exhibit a variety of quirks that
+   lead to a number of common errors (such as failing to display tuples and
+   dictionaries correctly).  If the value being printed may be a tuple or
+   dictionary, wrap it in a tuple.
+
+Bytes objects (``bytes``/``bytearray``) have one unique built-in operation:
+the ``%`` operator (modulo).
+This is also known as the bytes *formatting* or *interpolation* operator.
+Given ``format % values`` (where *format* is a bytes object), ``%`` conversion
+specifications in *format* are replaced with zero or more elements of *values*.
+The effect is similar to using the :c:func:`sprintf` in the C language.
+
+If *format* requires a single argument, *values* may be a single non-tuple
+object. [5]_  Otherwise, *values* must be a tuple with exactly the number of
+items specified by the format bytes object, or a single mapping object (for
+example, a dictionary).
+
+A conversion specifier contains two or more characters and has the following
+components, which must occur in this order:
+
+#. The ``'%'`` character, which marks the start of the specifier.
+
+#. Mapping key (optional), consisting of a parenthesised sequence of characters
+   (for example, ``(somename)``).
+
+#. Conversion flags (optional), which affect the result of some conversion
+   types.
+
+#. Minimum field width (optional).  If specified as an ``'*'`` (asterisk), the
+   actual width is read from the next element of the tuple in *values*, and the
+   object to convert comes after the minimum field width and optional precision.
+
+#. Precision (optional), given as a ``'.'`` (dot) followed by the precision.  If
+   specified as ``'*'`` (an asterisk), the actual precision is read from the next
+   element of the tuple in *values*, and the value to convert comes after the
+   precision.
+
+#. Length modifier (optional).
+
+#. Conversion type.
+
+When the right argument is a dictionary (or other mapping type), then the
+formats in the bytes object *must* include a parenthesised mapping key into that
+dictionary inserted immediately after the ``'%'`` character. The mapping key
+selects the value to be formatted from the mapping.  For example:
+
+   >>> print(b'%(language)s has %(number)03d quote types.' %
+   ...       {b'language': b"Python", b"number": 2})
+   b'Python has 002 quote types.'
+
+In this case no ``*`` specifiers may occur in a format (since they require a
+sequential parameter list).
+
+The conversion flag characters are:
+
+---------+---------------------------------------------------------------------+
+| Flag    | Meaning                                                             |
+=========+=====================================================================+
+| ``'#'`` | The value conversion will use the "alternate form" (where defined   |
+|         | below).                                                             |
+---------+---------------------------------------------------------------------+
+| ``'0'`` | The conversion will be zero padded for numeric values.              |
+---------+---------------------------------------------------------------------+
+| ``'-'`` | The converted value is left adjusted (overrides the ``'0'``         |
+|         | conversion if both are given).                                      |
+---------+---------------------------------------------------------------------+
+| ``' '`` | (a space) A blank should be left before a positive number (or empty |
+|         | string) produced by a signed conversion.                            |
+---------+---------------------------------------------------------------------+
+| ``'+'`` | A sign character (``'+'`` or ``'-'``) will precede the conversion   |
+|         | (overrides a "space" flag).                                         |
+---------+---------------------------------------------------------------------+
+
+A length modifier (``h``, ``l``, or ``L``) may be present, but is ignored as it
+is not necessary for Python -- so e.g. ``%ld`` is identical to ``%d``.
+
+The conversion types are:
+
+------------+-----------------------------------------------------+-------+
+| Conversion | Meaning                                             | Notes |
+============+=====================================================+=======+
+| ``'d'``    | Signed integer decimal.                             |       |
+------------+-----------------------------------------------------+-------+
+| ``'i'``    | Signed integer decimal.                             |       |
+------------+-----------------------------------------------------+-------+
+| ``'o'``    | Signed octal value.                                 | \(1)  |
+------------+-----------------------------------------------------+-------+
+| ``'u'``    | Obsolete type -- it is identical to ``'d'``.        | \(7)  |
+------------+-----------------------------------------------------+-------+
+| ``'x'``    | Signed hexadecimal (lowercase).                     | \(2)  |
+------------+-----------------------------------------------------+-------+
+| ``'X'``    | Signed hexadecimal (uppercase).                     | \(2)  |
+------------+-----------------------------------------------------+-------+
+| ``'e'``    | Floating point exponential format (lowercase).      | \(3)  |
+------------+-----------------------------------------------------+-------+
+| ``'E'``    | Floating point exponential format (uppercase).      | \(3)  |
+------------+-----------------------------------------------------+-------+
+| ``'f'``    | Floating point decimal format.                      | \(3)  |
+------------+-----------------------------------------------------+-------+
+| ``'F'``    | Floating point decimal format.                      | \(3)  |
+------------+-----------------------------------------------------+-------+
+| ``'g'``    | Floating point format. Uses lowercase exponential   | \(4)  |
+|            | format if exponent is less than -4 or not less than |       |
+|            | precision, decimal format otherwise.                |       |
+------------+-----------------------------------------------------+-------+
+| ``'G'``    | Floating point format. Uses uppercase exponential   | \(4)  |
+|            | format if exponent is less than -4 or not less than |       |
+|            | precision, decimal format otherwise.                |       |
+------------+-----------------------------------------------------+-------+
+| ``'c'``    | Single byte (accepts integer or single              |       |
+|            | byte objects).                                      |       |
+------------+-----------------------------------------------------+-------+
+| ``'b'``    | Bytes (any object that follows the                  | \(5)  |
+|            | :ref:`buffer protocol <bufferobjects>` or has       |       |
+|            | :meth:`__bytes__`).                                 |       |
+------------+-----------------------------------------------------+-------+
+| ``'s'``    | ``'s'`` is an alias for ``'b'`` and should only     | \(6)  |
+|            | be used for Python2/3 code bases.                   |       |
+------------+-----------------------------------------------------+-------+
+| ``'a'``    | Bytes (converts any Python object using             | \(5)  |
+|            | ``repr(obj).encode('ascii','backslashreplace)``).   |       |
+------------+-----------------------------------------------------+-------+
+| ``'%'``    | No argument is converted, results in a ``'%'``      |       |
+|            | character in the result.                            |       |
+------------+-----------------------------------------------------+-------+
+
+Notes:
+
+(1)
+   The alternate form causes a leading zero (``'0'``) to be inserted between
+   left-hand padding and the formatting of the number if the leading character
+   of the result is not already a zero.
+
+(2)
+   The alternate form causes a leading ``'0x'`` or ``'0X'`` (depending on whether
+   the ``'x'`` or ``'X'`` format was used) to be inserted between left-hand padding
+   and the formatting of the number if the leading character of the result is not
+   already a zero.
+
+(3)
+   The alternate form causes the result to always contain a decimal point, even if
+   no digits follow it.
+
+   The precision determines the number of digits after the decimal point and
+   defaults to 6.
+
+(4)
+   The alternate form causes the result to always contain a decimal point, and
+   trailing zeroes are not removed as they would otherwise be.
+
+   The precision determines the number of significant digits before and after the
+   decimal point and defaults to 6.
+
+(5)
+   If precision is ``N``, the output is truncated to ``N`` characters.
+
+(6)
+   ``b'%s'`` is deprecated, but will not be removed during the 3.x series.
+
+(7)
+   See :pep:`237`.
+
+.. note::
+
+   The bytearray version of this method does *not* operate in place - it
+   always produces a new object, even if no changes were made.
+
+.. seealso:: :pep:`461`.
+.. versionadded:: 3.5
+
 .. _typememoryview:

 Memory Views

--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@@ -62,6 +62,7 @@ PyAPI_FUNC(void) PyBytes_Concat(PyObject **, PyObject *);
 PyAPI_FUNC(void) PyBytes_ConcatAndDel(PyObject **, PyObject *);
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
+PyAPI_FUNC(PyObject *) _PyBytes_Format(PyObject *, PyObject *);
 #endif
 PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
 						   const char *, Py_ssize_t,

--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -2245,6 +2245,8 @@ PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
    Py_UNICODE c
    );

+PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
+
 /* Create a copy of a unicode string ending with a nul character. Return NULL
   and raise a MemoryError exception on memory allocation failure, otherwise
   return a new allocated buffer (use PyMem_Free() to free the buffer). */

--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -461,6 +461,28 @@ class BaseBytesTest:
        self.assertEqual(b.rindex(i, 3, 9), 7)
        self.assertRaises(ValueError, b.rindex, w, 1, 3)

+    def test_mod(self):
+        b = b'hello, %b!'
+        orig = b
+        b = b % b'world'
+        self.assertEqual(b, b'hello, world!')
+        self.assertEqual(orig, b'hello, %b!')
+        self.assertFalse(b is orig)
+        b = b'%s / 100 = %d%%'
+        a = b % (b'seventy-nine', 79)
+        self.assertEquals(a, b'seventy-nine / 100 = 79%')
+
+    def test_imod(self):
+        b = b'hello, %b!'
+        orig = b
+        b %= b'world'
+        self.assertEqual(b, b'hello, world!')
+        self.assertEqual(orig, b'hello, %b!')
+        self.assertFalse(b is orig)
+        b = b'%s / 100 = %d%%'
+        b %= (b'seventy-nine', 79)
+        self.assertEquals(b, b'seventy-nine / 100 = 79%')
+
    def test_replace(self):
        b = self.type2test(b'mississippi')
        self.assertEqual(b.replace(b'i', b'a'), b'massassappa')
@@ -990,6 +1012,28 @@ class ByteArrayTest(BaseBytesTest, unittest.TestCase):
        b[8:] = b
        self.assertEqual(b, bytearray(list(range(8)) + list(range(256))))

+    def test_mod(self):
+        b = bytearray(b'hello, %b!')
+        orig = b
+        b = b % b'world'
+        self.assertEqual(b, b'hello, world!')
+        self.assertEqual(orig, bytearray(b'hello, %b!'))
+        self.assertFalse(b is orig)
+        b = bytearray(b'%s / 100 = %d%%')
+        a = b % (b'seventy-nine', 79)
+        self.assertEquals(a, bytearray(b'seventy-nine / 100 = 79%'))
+
+    def test_imod(self):
+        b = bytearray(b'hello, %b!')
+        orig = b
+        b %= b'world'
+        self.assertEqual(b, b'hello, world!')
+        self.assertEqual(orig, bytearray(b'hello, %b!'))
+        self.assertFalse(b is orig)
+        b = bytearray(b'%s / 100 = %d%%')
+        b %= (b'seventy-nine', 79)
+        self.assertEquals(b, bytearray(b'seventy-nine / 100 = 79%'))
+
    def test_iconcat(self):
        b = bytearray(b"abc")
        b1 = b

--- a/Lib/test/test_format.py
+++ b/Lib/test/test_format.py
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -14,6 +14,9 @@ Core and Builtins
  atomic memory access if available. Patch written by Vitor de Lima and Gustavo
  Temple.

+- Issue #20284: %-interpolation (aka printf) formatting added for bytes and
+  bytearray.
+
 - Issue #23048: Fix jumping out of an infinite while loop in the pdb.

 - Issue #20335: bytes constructor now raises TypeError when encoding or errors

--- a/Objects/abstract.c
+++ b/Objects/abstract.c
@@ -686,8 +686,9 @@ PyObject_Format(PyObject *obj, PyObject *format_spec)
    Py_DECREF(meth);

    if (result && !PyUnicode_Check(result)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "__format__ method did not return string");
+        PyErr_Format(PyExc_TypeError,
+             "__format__ must return a str, not %.200s",
+             Py_TYPE(result)->tp_name);
        Py_DECREF(result);
        result = NULL;
        goto done;

--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -4,6 +4,7 @@
 #include "Python.h"
 #include "structmember.h"
 #include "bytes_methods.h"
+#include "bytesobject.h"

 /*[clinic input]
 class bytearray "PyByteArrayObject *" "&PyByteArray_Type"
@@ -294,6 +295,31 @@ PyByteArray_Concat(PyObject *a, PyObject *b)
    return (PyObject *)result;
 }

+static PyObject *
+bytearray_format(PyByteArrayObject *self, PyObject *args)
+{
+    PyObject *bytes_in, *bytes_out, *res;
+    char *bytestring;
+
+    if (self == NULL || !PyByteArray_Check(self) || args == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+    bytestring = PyByteArray_AS_STRING(self);
+    bytes_in = PyBytes_FromString(bytestring);
+    if (bytes_in == NULL)
+        return NULL;
+    bytes_out = _PyBytes_Format(bytes_in, args);
+    Py_DECREF(bytes_in);
+    if (bytes_out == NULL)
+        return NULL;
+    res = PyByteArray_FromObject(bytes_out);
+    Py_DECREF(bytes_out);
+    if (res == NULL)
+        return NULL;
+    return res;
+}
+
 /* Functions stuffed into the type object */

 static Py_ssize_t
@@ -3723,6 +3749,21 @@ bytearray_methods[] = {
    {NULL}
 };

+static PyObject *
+bytearray_mod(PyObject *v, PyObject *w)
+{
+    if (!PyByteArray_Check(v))
+        Py_RETURN_NOTIMPLEMENTED;
+    return bytearray_format((PyByteArrayObject *)v, w);
+}
+
+static PyNumberMethods bytearray_as_number = {
+    0,              /*nb_add*/
+    0,              /*nb_subtract*/
+    0,              /*nb_multiply*/
+    bytearray_mod,  /*nb_remainder*/
+};
+
 PyDoc_STRVAR(bytearray_doc,
 "bytearray(iterable_of_ints) -> bytearray\n\
 bytearray(string, encoding[, errors]) -> bytearray\n\
@@ -3751,7 +3792,7 @@ PyTypeObject PyByteArray_Type = {
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    (reprfunc)bytearray_repr,           /* tp_repr */
-    0,                                  /* tp_as_number */
+    &bytearray_as_number,               /* tp_as_number */
    &bytearray_as_sequence,             /* tp_as_sequence */
    &bytearray_as_mapping,              /* tp_as_mapping */
    0,                                  /* tp_hash */

--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -13893,8 +13893,8 @@ formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
-static PyObject*
-formatlong(PyObject *val, struct unicode_format_arg_t *arg)
+PyObject *
+_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
 {
    PyObject *result = NULL;
    char *buf;
@@ -13904,8 +13904,6 @@ formatlong(PyObject *val, struct unicode_format_arg_t *arg)
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;
-    int prec = arg->prec;
-    int type = arg->ch;

    /* Avoid exceeding SSIZE_T_MAX */
    if (prec > INT_MAX-3) {
@@ -13954,7 +13952,7 @@ formatlong(PyObject *val, struct unicode_format_arg_t *arg)
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
-                        "string too large in _PyBytes_FormatLong");
+                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
@@ -13964,7 +13962,7 @@ formatlong(PyObject *val, struct unicode_format_arg_t *arg)
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
-    if (((arg->flags & F_ALT) == 0 &&
+    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
@@ -14099,7 +14097,7 @@ mainformatlong(PyObject *v,
        return 1;
    }

-    res = formatlong(iobj, arg);
+    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
    Py_DECREF(iobj);
    if (res == NULL)
        return -1;