Issue #12778: Reduce memory consumption when JSON-encoding a large container of many small objects.

df7fc9dd · Antoine Pitrou · 9bb6dfe0 · df7fc9dd · df7fc9dd · df7fc9dd
Commit df7fc9dd authored Aug 19, 2011 by Antoine Pitrou
Hide whitespace changes
Inline Side-by-side

Showing with 175 additions and 50 deletions

Lib/test/json_tests/test_dump.py Lib/test/json_tests/test_dump.py +18 -1

Misc/NEWS Misc/NEWS +3 -0

Modules/_json.c Modules/_json.c +154 -49

No files found.
--- a/Lib/test/json_tests/test_dump.py
+++ b/Lib/test/json_tests/test_dump.py
 from io import StringIO
 from test.json_tests import PyTest, CTest

+from test.support import precisionbigmemtest, _1G

 class TestDump:
    def test_dump(self):
@@ -21,4 +22,20 @@ class TestDump:


 class TestPyDump(TestDump, PyTest): pass
-class TestCDump(TestDump, CTest): pass
+
+class TestCDump(TestDump, CTest):
+
+    # The size requirement here is hopefully over-estimated (actual
+    # memory consumption depending on implementation details, and also
+    # system memory management, since this may allocate a lot of
+    # small objects).
+
+    @precisionbigmemtest(size=_1G, memuse=1)
+    def test_large_list(self, size):
+        N = int(30 * 1024 * 1024 * (size / _1G))
+        l = [1] * N
+        encoded = self.dumps(l)
+        self.assertEqual(len(encoded), N * 3)
+        self.assertEqual(encoded[:1], "[")
+        self.assertEqual(encoded[-2:], "1]")
+        self.assertEqual(encoded[1:-2], "1, " * (N - 1))
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -262,6 +262,9 @@ Core and Builtins
 Library
 -------

+- Issue #12778: Reduce memory consumption when JSON-encoding a large
+  container of many small objects.
+
 - Issue #12650: Fix a race condition where a subprocess.Popen could leak
  resources (FD/zombie) when killed at the wrong time.


--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -75,6 +75,121 @@ static PyMemberDef encoder_members[] = {
    {NULL}
 };

+/*
+ * A two-level accumulator of unicode objects that avoids both the overhead
+ * of keeping a huge number of small separate objects, and the quadratic
+ * behaviour of using a naive repeated concatenation scheme.
+ */
+
+typedef struct {
+    PyObject *large;  /* A list of previously accumulated large strings */
+    PyObject *small;  /* Pending small strings */
+} accumulator;
+
+static PyObject *
+join_list_unicode(PyObject *lst)
+{
+    /* return u''.join(lst) */
+    static PyObject *sep = NULL;
+    if (sep == NULL) {
+        sep = PyUnicode_FromStringAndSize("", 0);
+        if (sep == NULL)
+            return NULL;
+    }
+    return PyUnicode_Join(sep, lst);
+}
+
+static int
+init_accumulator(accumulator *acc)
+{
+    acc->large = PyList_New(0);
+    if (acc->large == NULL)
+        return -1;
+    acc->small = PyList_New(0);
+    if (acc->small == NULL) {
+        Py_CLEAR(acc->large);
+        return -1;
+    }
+    return 0;
+}
+
+static int
+flush_accumulator(accumulator *acc)
+{
+    Py_ssize_t nsmall = PyList_GET_SIZE(acc->small);
+    if (nsmall) {
+        int ret;
+        PyObject *joined = join_list_unicode(acc->small);
+        if (joined == NULL)
+            return -1;
+        if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
+            Py_DECREF(joined);
+            return -1;
+        }
+        ret = PyList_Append(acc->large, joined);
+        Py_DECREF(joined);
+        return ret;
+    }
+    return 0;
+}
+
+static int
+accumulate_unicode(accumulator *acc, PyObject *obj)
+{
+    int ret;
+    Py_ssize_t nsmall;
+    assert(PyUnicode_Check(obj));
+
+    if (PyList_Append(acc->small, obj))
+        return -1;
+    nsmall = PyList_GET_SIZE(acc->small);
+    /* Each item in a list of unicode objects has an overhead (in 64-bit
+     * builds) of:
+     *   - 8 bytes for the list slot
+     *   - 56 bytes for the header of the unicode object
+     * that is, 64 bytes.  100000 such objects waste more than 6MB
+     * compared to a single concatenated string.
+     */
+    if (nsmall < 100000)
+        return 0;
+    PyObject *joined = join_list_unicode(acc->small);
+    if (joined == NULL)
+        return -1;
+    if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
+        Py_DECREF(joined);
+        return -1;
+    }
+    ret = PyList_Append(acc->large, joined);
+    Py_DECREF(joined);
+    return ret;
+}
+
+static PyObject *
+finish_accumulator(accumulator *acc)
+{
+    int ret;
+    PyObject *res;
+
+    ret = flush_accumulator(acc);
+    Py_CLEAR(acc->small);
+    if (ret) {
+        Py_CLEAR(acc->large);
+        return NULL;
+    }
+    res = acc->large;
+    acc->large = NULL;
+    return res;
+}
+
+static void
+destroy_accumulator(accumulator *acc)
+{
+    Py_CLEAR(acc->small);
+    Py_CLEAR(acc->large);
+}
+
+/* Forward decls */
+
 static PyObject *
 ascii_escape_unicode(PyObject *pystr);
 static PyObject *
@@ -101,11 +216,11 @@ encoder_dealloc(PyObject *self);
 static int
 encoder_clear(PyObject *self);
 static int
-encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level);
+encoder_listencode_list(PyEncoderObject *s, accumulator *acc, PyObject *seq, Py_ssize_t indent_level);
 static int
-encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level);
+encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, PyObject *obj, Py_ssize_t indent_level);
 static int
-encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level);
+encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, PyObject *dct, Py_ssize_t indent_level);
 static PyObject *
 _encoded_const(PyObject *obj);
 static void
@@ -266,19 +381,6 @@ raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
    }
 }

-static PyObject *
-join_list_unicode(PyObject *lst)
-{
-    /* return u''.join(lst) */
-    static PyObject *sep = NULL;
-    if (sep == NULL) {
-        sep = PyUnicode_FromStringAndSize("", 0);
-        if (sep == NULL)
-            return NULL;
-    }
-    return PyUnicode_Join(sep, lst);
-}
-
 static PyObject *
 _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
    /* return (rval, idx) tuple, stealing reference to rval */
@@ -1226,22 +1328,22 @@ encoder_call(PyObject *self, PyObject *args, PyObject *kwds)
    /* Python callable interface to encode_listencode_obj */
    static char *kwlist[] = {"obj", "_current_indent_level", NULL};
    PyObject *obj;
-    PyObject *rval;
    Py_ssize_t indent_level;
    PyEncoderObject *s;
+    accumulator acc;
+
    assert(PyEncoder_Check(self));
    s = (PyEncoderObject *)self;
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist,
        &obj, _convertPyInt_AsSsize_t, &indent_level))
        return NULL;
-    rval = PyList_New(0);
-    if (rval == NULL)
+    if (init_accumulator(&acc))
        return NULL;
-    if (encoder_listencode_obj(s, rval, obj, indent_level)) {
-        Py_DECREF(rval);
+    if (encoder_listencode_obj(s, &acc, obj, indent_level)) {
+        destroy_accumulator(&acc);
        return NULL;
    }
-    return rval;
+    return finish_accumulator(&acc);
 }

 static PyObject *
@@ -1313,18 +1415,19 @@ encoder_encode_string(PyEncoderObject *s, PyObject *obj)
 }

 static int
-_steal_list_append(PyObject *lst, PyObject *stolen)
+_steal_accumulate(accumulator *acc, PyObject *stolen)
 {
    /* Append stolen and then decrement its reference count */
-    int rval = PyList_Append(lst, stolen);
+    int rval = accumulate_unicode(acc, stolen);
    Py_DECREF(stolen);
    return rval;
 }

 static int
-encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level)
+encoder_listencode_obj(PyEncoderObject *s, accumulator *acc,
+                       PyObject *obj, Py_ssize_t indent_level)
 {
-    /* Encode Python object obj to a JSON term, rval is a PyList */
+    /* Encode Python object obj to a JSON term */
    PyObject *newobj;
    int rv;

@@ -1332,38 +1435,38 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi
        PyObject *cstr = _encoded_const(obj);
        if (cstr == NULL)
            return -1;
-        return _steal_list_append(rval, cstr);
+        return _steal_accumulate(acc, cstr);
    }
    else if (PyUnicode_Check(obj))
    {
        PyObject *encoded = encoder_encode_string(s, obj);
        if (encoded == NULL)
            return -1;
-        return _steal_list_append(rval, encoded);
+        return _steal_accumulate(acc, encoded);
    }
    else if (PyLong_Check(obj)) {
        PyObject *encoded = PyObject_Str(obj);
        if (encoded == NULL)
            return -1;
-        return _steal_list_append(rval, encoded);
+        return _steal_accumulate(acc, encoded);
    }
    else if (PyFloat_Check(obj)) {
        PyObject *encoded = encoder_encode_float(s, obj);
        if (encoded == NULL)
            return -1;
-        return _steal_list_append(rval, encoded);
+        return _steal_accumulate(acc, encoded);
    }
    else if (PyList_Check(obj) || PyTuple_Check(obj)) {
        if (Py_EnterRecursiveCall(" while encoding a JSON object"))
            return -1;
-        rv = encoder_listencode_list(s, rval, obj, indent_level);
+        rv = encoder_listencode_list(s, acc, obj, indent_level);
        Py_LeaveRecursiveCall();
        return rv;
    }
    else if (PyDict_Check(obj)) {
        if (Py_EnterRecursiveCall(" while encoding a JSON object"))
            return -1;
-        rv = encoder_listencode_dict(s, rval, obj, indent_level);
+        rv = encoder_listencode_dict(s, acc, obj, indent_level);
        Py_LeaveRecursiveCall();
        return rv;
    }
@@ -1394,7 +1497,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi

        if (Py_EnterRecursiveCall(" while encoding a JSON object"))
            return -1;
-        rv = encoder_listencode_obj(s, rval, newobj, indent_level);
+        rv = encoder_listencode_obj(s, acc, newobj, indent_level);
        Py_LeaveRecursiveCall();

        Py_DECREF(newobj);
@@ -1414,9 +1517,10 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi
 }

 static int
-encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level)
+encoder_listencode_dict(PyEncoderObject *s, accumulator *acc,
+                        PyObject *dct, Py_ssize_t indent_level)
 {
-    /* Encode Python dict dct a JSON term, rval is a PyList */
+    /* Encode Python dict dct a JSON term */
    static PyObject *open_dict = NULL;
    static PyObject *close_dict = NULL;
    static PyObject *empty_dict = NULL;
@@ -1436,7 +1540,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
            return -1;
    }
    if (Py_SIZE(dct) == 0)
-        return PyList_Append(rval, empty_dict);
+        return accumulate_unicode(acc, empty_dict);

    if (s->markers != Py_None) {
        int has_key;
@@ -1454,7 +1558,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
        }
    }

-    if (PyList_Append(rval, open_dict))
+    if (accumulate_unicode(acc, open_dict))
        goto bail;

    if (s->indent != Py_None) {
@@ -1541,7 +1645,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
        }

        if (idx) {
-            if (PyList_Append(rval, s->item_separator))
+            if (accumulate_unicode(acc, s->item_separator))
                goto bail;
        }

@@ -1549,16 +1653,16 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
        Py_CLEAR(kstr);
        if (encoded == NULL)
            goto bail;
-        if (PyList_Append(rval, encoded)) {
+        if (accumulate_unicode(acc, encoded)) {
            Py_DECREF(encoded);
            goto bail;
        }
        Py_DECREF(encoded);
-        if (PyList_Append(rval, s->key_separator))
+        if (accumulate_unicode(acc, s->key_separator))
            goto bail;

        value = PyTuple_GET_ITEM(item, 1);
-        if (encoder_listencode_obj(s, rval, value, indent_level))
+        if (encoder_listencode_obj(s, acc, value, indent_level))
            goto bail;
        idx += 1;
        Py_DECREF(item);
@@ -1578,7 +1682,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss

        yield '\n' + (' ' * (_indent * _current_indent_level))
    }*/
-    if (PyList_Append(rval, close_dict))
+    if (accumulate_unicode(acc, close_dict))
        goto bail;
    return 0;

@@ -1592,9 +1696,10 @@ bail:


 static int
-encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level)
+encoder_listencode_list(PyEncoderObject *s, accumulator *acc,
+                        PyObject *seq, Py_ssize_t indent_level)
 {
-    /* Encode Python list seq to a JSON term, rval is a PyList */
+    /* Encode Python list seq to a JSON term */
    static PyObject *open_array = NULL;
    static PyObject *close_array = NULL;
    static PyObject *empty_array = NULL;
@@ -1618,7 +1723,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
    num_items = PySequence_Fast_GET_SIZE(s_fast);
    if (num_items == 0) {
        Py_DECREF(s_fast);
-        return PyList_Append(rval, empty_array);
+        return accumulate_unicode(acc, empty_array);
    }

    if (s->markers != Py_None) {
@@ -1638,7 +1743,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
    }

    seq_items = PySequence_Fast_ITEMS(s_fast);
-    if (PyList_Append(rval, open_array))
+    if (accumulate_unicode(acc, open_array))
        goto bail;
    if (s->indent != Py_None) {
        /* TODO: DOES NOT RUN */
@@ -1652,10 +1757,10 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
    for (i = 0; i < num_items; i++) {
        PyObject *obj = seq_items[i];
        if (i) {
-            if (PyList_Append(rval, s->item_separator))
+            if (accumulate_unicode(acc, s->item_separator))
                goto bail;
        }
-        if (encoder_listencode_obj(s, rval, obj, indent_level))
+        if (encoder_listencode_obj(s, acc, obj, indent_level))
            goto bail;
    }
    if (ident != NULL) {
@@ -1670,7 +1775,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss

        yield '\n' + (' ' * (_indent * _current_indent_level))
    }*/
-    if (PyList_Append(rval, close_array))
+    if (accumulate_unicode(acc, close_array))
        goto bail;
    Py_DECREF(s_fast);
    return 0;