bigarray: ArrayRef support for BigArray

Rationale
---------

Array reference could be useful in situations where one needs to pass arrays
between processes and, instead of copying array data, leverage the fact that
the top-level array, for example ZBigArray, is already persisted separately, and
only send a small amount of information referencing the data in question.

Implementation
--------------

BigArray is not a regular NumPy array and so needs explicit support in ArrayRef
code to find the root object and indices. This patch adds such support in the
following way:

- when BigArray.__getitem__ creates a VMA, it remembers in the VMA the top-level
  BigArray object under which this VMA was created.

- when ArrayRef is finding the root, it can detect such VMAs, because the VMA
  will be pointed to by the topmost regular ndarray's .base, and in turn gets
  the top-level BigArray object from the VMA.

- further, all index computations are performed, similarly to the complete
  regular-ndarrays case, on ndarrays root and a. But in the end .lo and .hi are
  adjusted for the corresponding offset of where root is inside the whole
  BigArray.

- there is no need to adjust .deref() at all.

For remembering information in a VMA and also to be able to get (readonly) its
mapping addresses, the _bigfile.c extension has to be extended a bit. Since we
are now storing an arbitrary python object attached to PyVMA - it can create
cycles - PyVMA is accordingly adjusted to support the cyclic garbage collector.

Please see the patch itself for more details and comments.

bigarray: ArrayRef support for BigArray
Rationale
---------

Array reference could be useful in situations where one needs to pass arrays
between processes and, instead of copying array data, leverage the fact that
the top-level array, for example ZBigArray, is already persisted separately, and
only send a small amount of information referencing the data in question.

Implementation
--------------

BigArray is not a regular NumPy array and so needs explicit support in ArrayRef
code to find the root object and indices. This patch adds such support in the
following way:

- when BigArray.__getitem__ creates a VMA, it remembers in the VMA the top-level
  BigArray object under which this VMA was created.

- when ArrayRef is finding the root, it can detect such VMAs, because the VMA
  will be pointed to by the topmost regular ndarray's .base, and in turn gets
  the top-level BigArray object from the VMA.

- further, all index computations are performed, similarly to the complete
  regular-ndarrays case, on ndarrays root and a. But in the end .lo and .hi are
  adjusted for the corresponding offset of where root is inside the whole
  BigArray.

- there is no need to adjust .deref() at all.

For remembering information in a VMA and also to be able to get (readonly) its
mapping addresses, the _bigfile.c extension has to be extended a bit. Since we
are now storing an arbitrary python object attached to PyVMA - it can create
cycles - PyVMA is accordingly adjusted to support the cyclic garbage collector.

Please see the patch itself for more details and comments.
450ad804 · Kirill Smelkov · d53371b6 · 450ad804 · 450ad804 · 450ad804
Commit 450ad804 authored Apr 02, 2018 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 168 additions and 11 deletions

bigarray/__init__.py bigarray/__init__.py +53 -2

bigarray/tests/test_basic.py bigarray/tests/test_basic.py +15 -2

bigfile/_bigfile.c bigfile/_bigfile.c +100 -7

No files found.
--- a/bigarray/__init__.py
+++ b/bigarray/__init__.py
@@ -410,6 +410,10 @@ class BigArray(object):
            # ~~~ mmap file part corresponding to full major slice into memory
            vmaM = self._fileh.mmap(pageM_min, pageM_max-pageM_min+1)

+            # remember to which BigArray this vma belongs.
+            # this is needed for ArrayRef to be able to find root array object.
+            vmaM.pyuser = self
+

            # first get ndarray view with only major slice specified and rest indices being ":"
            viewM_shape   = Mreplace(self._shape, nitemsM)
@@ -500,9 +504,29 @@ def _flatbytev(a):
 # The reference is represented by root array object and instructions how to
 # create original array as some view of the root.
 #
+# Such reference could be useful in situations where one needs to pass arrays
+# between processes and instead of copying array data, leverage the fact that
+# top-level array, for example ZBigArray, is already persisted separately, and
+# only send small amount of information referencing data in question.
+#
 # Use ArrayRef(array) to create reference to an ndarray.
 #
 # Use .deref() to convert ArrayRef to pointed array object.
+#
+# NOTE
+#
+# don't send ArrayRef unconditionally - for example, when the array object is a
+# small regular ndarray whose root is also a regular, but big, ndarray, sending
+# ArrayRef will send the whole data of the root object, not just the small leaf.
+#
+# Sending ArrayRef only makes sense when root object is known to be already
+# persisted by other means, for example something like below in ZODB context:
+#
+#   aref = ArrayRef(a)
+#   if isinstance(aref.root, Persistent):
+#       send aref
+#   else:
+#       send a
 class ArrayRef(object):
    # .root         top-level array object
    #
@@ -629,6 +653,7 @@ class ArrayRef(object):
    def __init__(aref, a):
        # find root
        root = a            # top-level ndarray
+        bigvma = None       # VMA, that is root.base, if there is one
        while 1:
            base = root.base

@@ -650,7 +675,9 @@ class ArrayRef(object):

            # base is neither ndarray (sub)class nor ndarray proxy.
            #
-            # it is top-level ndarray with base taken from an object
+            # either it is
+            #
+            # 1) top-level ndarray with base taken from an object
            # with buffer interface, e.g. as here:
            #
            #   In [1]: s = '123'
@@ -660,7 +687,14 @@ class ArrayRef(object):
            #   In [4]: x.base
            #   Out[4]: '123'
            #
-            # and so it should be treated as top-level ndarray.
+            # and so it should be treated as top-level ndarray,
+            #
+            # 2) or it is a VMA created from under BigArray which will be
+            # treated as top-level too, and corrected for in the end.
+            basetype = type(base)
+            if basetype.__module__ + "." + basetype.__name__ == "_bigfile.VMA":
+            #if isinstance(base, _bigfile.VMA):  XXX _bigfile does not expose VMA
+                bigvma = base
            break


@@ -716,5 +750,22 @@ class ArrayRef(object):
        aref.dtype   = a.dtype
        aref.atype   = type(a)

+        # correct it, if the root is actually BigArray
+        if bigvma is not None:
+            assert bigvma.addr_start <= rdata[0]
+            assert rdata[0] + len(broot) <= bigvma.addr_stop
+
+            bigroot = bigvma.pyuser
+            assert isinstance(bigroot, BigArray)
+
+            # bigoff is broot position in bbigroot (both raw flat []byte ↑ along memory)
+            pgoff, _ = bigvma.filerange()
+            bigoff = pgoff * bigvma.pagesize()          # vma start offset
+            bigoff += rdata[0] - bigvma.addr_start      # broot offset from vma start
+
+            aref.root  = bigroot
+            aref.lo   += bigoff
+            aref.hi   += bigoff
+
        # we are done
        return
--- a/bigarray/tests/test_basic.py
+++ b/bigarray/tests/test_basic.py
@@ -679,7 +679,17 @@ def test_arrayref():
    assert array_equal(ref.deref(), a)


-    for root in (data, rdata):
+    # BigArray with data backend.
+    # data_ is the same as data but shifted to exercise vma and vma->broot offsets calculation.
+    data_ = zeros(8*PS, dtype=uint8)
+    data_[2*PS-1:][:PS] = data
+    f  = BigFile_Data_RO(data_, PS)
+    fh = f.fileh_open()
+    A  = BigArray(data_.shape, data_.dtype, fh)
+    assert array_equal(A[2*PS-1:][:PS], data)
+
+
+    for root in (data, rdata, A):  # both ndarray and BigArray roots
        # refok verifies whether ArrayRef(x) works ok
        def refok(x):
            ref = ArrayRef(x)
@@ -702,7 +712,10 @@ def test_arrayref():
            assert array_equal(ref.deref(), x)


-        if root is rdata:
+        if isinstance(root, BigArray):
+            a = root[2*PS-1:][:PS]      # get to `data` range
+        # typeof(root) = ndarray
+        elif root is rdata:
            a = root[::-1]              # rdata
        else:
            a = root[:]                 # data

--- a/bigfile/_bigfile.c
+++ b/bigfile/_bigfile.c
 /* Wendelin.bigfile | Python interface to memory/files
- * Copyright (C) 2014-2015  Nexedi SA and Contributors.
+ * Copyright (C) 2014-2018  Nexedi SA and Contributors.
 *                          Kirill Smelkov <kirr@nexedi.com>
 *
 * This program is free software: you can Use, Study, Modify and Redistribute
@@ -59,12 +59,36 @@ static PyObject *pybuf_str;

 /*
 * python representation of VMA - exposes vma memory as python buffer
+ *
+ * also exposes:
+ *
+ *      .filerange()            to know which range in mmaped file this vma covers.
+ *      .pagesize()             to know page size of underlying RAM.
+ *
+ * and:
+ *
+ *      .addr_start, .addr_stop to know offset of ndarray in VMA.
+ *      .pyuser                 generic python-level attribute (see below).
 */
 struct PyVMA {
    PyObject;                   /* unnamed member: object header embedded in place (relies on a compiler extension for unnamed struct members) */
    PyObject *in_weakreflist;   /* presumably the weak-reference list slot - confirm against PyVMA_Type.tp_weaklistoffset */

    VMA;                        /* unnamed member: the C-level VMA; its fields (addr_start, addr_stop, f_pgoffset, fileh) are reached directly through PyVMA */
+
+    /* python-level user of this VMA.
+     *
+     * for example for ArrayRef to work, BigArray needs to find out VMA ->
+     * top-level BigArray object for which this VMA was created.
+     *
+     * There is vma -> fileh -> file chain, but e.g. for a given ZBigFile there
+     * can be several ZBigArrays created on top of it to view its data (e.g. via
+     * BigArray.view()). So even if it can go from vma to -> zfile it does not
+     * help to find out the top-level ZBigArray object itself.
+     *
+     * This way we allow BigArray python code to set vma.pyuser attribute
+     * pointing to original BigArray object for which this VMA was created. */
+    PyObject *pyuser;           /* reported to the cyclic GC by pyvma_traverse and dropped by pyvma_clear; exposed read-write as .pyuser */
 };
 typedef struct PyVMA PyVMA;

@@ -140,6 +164,11 @@ void XPyBufferObject_Unpin(PyBufferObject *bufo);
 void XPyBuffer_Unpin(Py_buffer *view);


+#define PyFunc(FUNC, DOC)               /* declares a handler function together with its docstring */ \
+static const char FUNC ##_doc[] = DOC;  /* FUNC_doc[] holds DOC; it is what PyMethodDef tables reference */ \
+static PyObject *FUNC                   /* the use site continues with the parameter list and the body */
+
+
 /************
 *  PyVMA   *
 ************/
@@ -193,6 +222,50 @@ pyvma_len(PyObject *pyvma0)
 }


+/* pyvma vs cyclic GC */
+static int
+pyvma_traverse(PyObject *pyvma0, visitproc visit, void *arg)    /* installed as PyVMA_Type.tp_traverse */
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+
+    Py_VISIT(pyvma->pyuser);    /* pyuser is the only object reference this function reports to the GC */
+    return 0;                   /* Py_VISIT returns early by itself when visit() yields non-zero */
+}
+
+static int
+pyvma_clear(PyObject *pyvma0)   /* installed as PyVMA_Type.tp_clear; also invoked directly from pyvma_dealloc */
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+
+    Py_CLEAR(pyvma->pyuser);    /* drops the reference and resets the field to NULL, so a repeated call is harmless */
+    return 0;
+}
+
+
+PyFunc(pyvma_filerange, "filerange() -> (pgoffset, pglen) -- file range this vma covers")
+    (PyObject *pyvma0, PyObject *args)      /* NOTE(review): args is never parsed, so extra arguments are silently accepted */
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+    Py_ssize_t pgoffset, pglen;     // XXX Py_ssize_t vs pgoff_t
+
+    pgoffset = pyvma->f_pgoffset;   /* counted in pages, not bytes: the Python caller multiplies it by pagesize() */
+    pglen    = (pyvma->addr_stop - pyvma->addr_start) / pyvma->fileh->ramh->ram->pagesize;  /* mapping length in bytes divided by the RAM page size */
+    /* NOTE ^^^ addr_stop and addr_start must be page-aligned */
+
+    return Py_BuildValue("(nn)", pgoffset, pglen);  /* "n" is the Py_ssize_t format unit; yields a 2-tuple */
+}
+
+
+PyFunc(pyvma_pagesize, "pagesize() -> pagesize -- page size of RAM underlying this VMA")
+    (PyObject *pyvma0, PyObject *args)      /* NOTE(review): args is never parsed, so extra arguments are silently accepted */
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+    Py_ssize_t pagesize = pyvma->fileh->ramh->ram->pagesize;    /* same vma -> fileh -> ramh -> ram chain that filerange() divides by */
+
+    return Py_BuildValue("n", pagesize);    /* "n" is the Py_ssize_t format unit */
+}
+
+
 static void
 pyvma_dealloc(PyObject *pyvma0)
 {
@@ -210,6 +283,7 @@ pyvma_dealloc(PyObject *pyvma0)
        Py_DECREF(pyfileh);
    }

+    pyvma_clear(pyvma);
    pyvma->ob_type->tp_free(pyvma);
 }

@@ -247,12 +321,35 @@ static /*const*/ PySequenceMethods pyvma_as_seq = {
 };


+static /*const*/ PyMethodDef pyvma_methods[] = {   /* method table installed as PyVMA_Type.tp_methods */
+    {"filerange",   pyvma_filerange,    METH_VARARGS,   pyvma_filerange_doc},  /* NOTE(review): the handler ignores args - METH_NOARGS may fit better; confirm */
+    {"pagesize",    pyvma_pagesize,     METH_VARARGS,   pyvma_pagesize_doc},   /* NOTE(review): same as above */
+    {NULL}  /* sentinel terminating the table */
+};
+
+// XXX vvv better switch on various possibilities and find appropriate type
+// (e.g. on X32 uintptr_t will be 4 while long will be 8)
+const int _ =
+    BUILD_ASSERT_OR_ZERO(sizeof(uintptr_t) == sizeof(unsigned long));   // presumably a build-time check guarding the alias below - confirm the macro's definition
+#define T_UINTPTR   T_ULONG     // member type used for addr_start / addr_stop in pyvma_members
+
+static /*const*/ PyMemberDef pyvma_members[] = {   /* attribute table installed as PyVMA_Type.tp_members */
+    {"addr_start",  T_UINTPTR,      offsetof(PyVMA, addr_start),  READONLY, "vma's start addr"},
+    {"addr_stop",   T_UINTPTR,      offsetof(PyVMA, addr_stop),   READONLY, "vma's stop addr"},   /* docstring previously repeated "start" (copy-paste slip) */
+    // XXX pyuser: restrict to read-only access?
+    {"pyuser",      T_OBJECT_EX,    offsetof(PyVMA, pyuser),      0,        "user of this vma"},  /* T_OBJECT_EX: reading raises AttributeError while the slot is NULL */
+    {NULL}  /* sentinel terminating the table */
+};
+
 static PyTypeObject PyVMA_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name            = "_bigfile.VMA",
    .tp_basicsize       = sizeof(PyVMA),
-    .tp_flags           = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_NEWBUFFER,
-    .tp_methods         = NULL, // TODO ?
+    .tp_flags           = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_NEWBUFFER | Py_TPFLAGS_HAVE_GC,
+    .tp_traverse        = pyvma_traverse,
+    .tp_clear           = pyvma_clear,
+    .tp_methods         = pyvma_methods,
+    .tp_members         = pyvma_members,
    .tp_as_sequence     = &pyvma_as_seq,
    .tp_as_buffer       = &pyvma_as_buffer,
    .tp_dealloc         = pyvma_dealloc,
@@ -268,10 +365,6 @@ static PyTypeObject PyVMA_Type = {
 ****************/


-#define PyFunc(FUNC, DOC)               \
-static const char FUNC ##_doc[] = DOC;  \
-static PyObject *FUNC
-
 PyFunc(pyfileh_mmap, "mmap(pgoffset, pglen) - map fileh part into memory")
    (PyObject *pyfileh0, PyObject *args)
 {