bigarray: ArrayRef utility

ArrayRef is a reference to a NumPy array. The reference is represented by the root array object and instructions on how to create the original array as some view of the root.

Such a reference could be useful in situations where one needs to pass arrays between processes and, instead of copying array data, leverage the fact that the top-level array, for example ZBigArray, is already persisted separately, and only send a small amount of information referencing the data in question.

Use `ArrayRef(array)` to create a reference to an ndarray.

Use `.deref()` to convert an ArrayRef to the pointed-to array object.

NOTE don't send ArrayRef unconditionally - for example, when the array object is a small regular ndarray whose root is also a regular, but big, ndarray, sending ArrayRef will send the whole data of the root object, not just of the small leaf.

Sending ArrayRef only makes sense when the root object is known to be already persisted by other means, for example something like below in ZODB context:

```python
aref = ArrayRef(a)
if isinstance(aref.root, Persistent):
    send aref
else:
    send a
```

Please see individual patches for more details.

/reviewed-on !6

bigarray: ArrayRef utility
ArrayRef is a reference to a NumPy array. The reference is represented by the root array object and instructions on how to create the original array as some view of the root.

Such a reference could be useful in situations where one needs to pass arrays between processes and, instead of copying array data, leverage the fact that the top-level array, for example ZBigArray, is already persisted separately, and only send a small amount of information referencing the data in question.

Use `ArrayRef(array)` to create a reference to an ndarray.

Use `.deref()` to convert an ArrayRef to the pointed-to array object.

NOTE don't send ArrayRef unconditionally - for example, when the array object is a small regular ndarray whose root is also a regular, but big, ndarray, sending ArrayRef will send the whole data of the root object, not just of the small leaf.

Sending ArrayRef only makes sense when the root object is known to be already persisted by other means, for example something like below in ZODB context:

```python
aref = ArrayRef(a)
if isinstance(aref.root, Persistent):
    send aref
else:
    send a
```

Please see individual patches for more details.

/reviewed-on !6
e9d61a89 · Kirill Smelkov · f785ac07 · 450ad804 · e9d61a89 · e9d61a89
Commit e9d61a89 authored Apr 02, 2018 by Kirill Smelkov
Showing with 641 additions and 13 deletions

bigarray/__init__.py bigarray/__init__.py +298 -2

bigarray/tests/test_basic.py bigarray/tests/test_basic.py +243 -4

bigfile/_bigfile.c bigfile/_bigfile.c +100 -7

No files found.
--- a/bigarray/__init__.py
+++ b/bigarray/__init__.py
--- a/bigarray/tests/test_basic.py
+++ b/bigarray/tests/test_basic.py
+# -*- coding: utf-8 -*-
 # Wendelin.core.bigarray | Basic tests
-# Copyright (C) 2014-2015  Nexedi SA and Contributors.
+# Copyright (C) 2014-2018  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -18,12 +19,13 @@
 # See COPYING file for full licensing terms.
 # See https://www.nexedi.com/licensing for rationale and options.

-from wendelin.bigarray import BigArray
+from wendelin.bigarray import BigArray, ArrayRef, _flatbytev
 from wendelin.bigfile import BigFile
 from wendelin.lib.mem import memcpy
 from wendelin.lib.calc import mul
-from numpy import ndarray, dtype, int64, int32, uint32, uint8, all, zeros, arange, \
-        array_equal, asarray
+from numpy import ndarray, dtype, int64, int32, uint32, int16, uint8, all, zeros, arange, \
+        array_equal, asarray, newaxis, swapaxes
+from numpy.lib.stride_tricks import as_strided
 import numpy

 from pytest import raises
@@ -588,3 +590,240 @@ def test_bigarray_to_ndarray():
    for i in range(48,65):
        C = BigArray(((1<<i)-1,), uint8, Zh)
        raises(MemoryError, 'asarray(C)')
+
+
+
+
+def test_arrayref():
+    # test data - all items are unique - so we can check array by content
+    data = zeros(PS, dtype=uint8)
+    data32 = data.view(uint32)
+    data32[:] = arange(len(data32), dtype=uint32)
+    data[:256] = arange(256, dtype=uint8)   # first starting bytes are all unique
+
+    # regular ndarray without parent at all
+    ref = ArrayRef(data)
+    assert ref.root is data
+    assert ref.lo == 0
+    assert ref.hi == len(data)
+    assert ref.z0 == 0
+    assert ref.shape == data.shape
+    assert ref.stridev == data.strides
+    assert ref.dtype == data.dtype
+    assert array_equal(ref.deref(), data)
+
+    # regular ndarrays with parent
+    ref = ArrayRef(data32)
+    assert ref.root is data
+    assert ref.lo == 0
+    assert ref.hi == len(data)
+    assert ref.z0 == 0
+    assert ref.shape == data32.shape
+    assert ref.stridev == data32.strides
+    assert ref.dtype == data32.dtype
+    assert array_equal(ref.deref(), data32)
+
+    a = data[100:140]
+    ref = ArrayRef(a)
+    assert ref.root is data
+    assert ref.lo == 100
+    assert ref.hi == 140
+    assert ref.z0 == 0
+    assert ref.shape == (40,)
+    assert ref.stridev == (1,)
+    assert ref.dtype == data.dtype
+    assert array_equal(ref.deref(), a)
+
+    a = data[140:100:-1]
+    ref = ArrayRef(a)
+    assert ref.root is data
+    assert ref.lo == 101
+    assert ref.hi == 141
+    assert ref.z0 == 39
+    assert ref.shape == (40,)
+    assert ref.stridev == (-1,)
+    assert ref.dtype == data.dtype
+    assert array_equal(ref.deref(), a)
+
+    a = data[100:140:-1]   # empty
+    ref = ArrayRef(a)
+    assert ref.root is data
+    assert ref.lo == 0
+    assert ref.hi == 1
+    assert ref.z0 == 0
+    assert ref.shape == (0,)
+    assert ref.stridev == (1,)
+    assert ref.dtype == data.dtype
+    assert array_equal(ref.deref(), a)
+
+    # rdata is the same as data[::-1] but without base - i.e. it is toplevel
+    m = memoryview(data[::-1])
+    rdata = asarray(m)
+    assert array_equal(rdata[::-1], data)
+    assert rdata.strides == (-1,)
+    m_ = rdata.base
+    assert isinstance(m_, memoryview)
+    #assert m_ is m  XXX strangely it is another object, not exactly m
+    # XXX however rdata.strides<0 and no rdata.base.base is enough for us here.
+    raises(AttributeError, 'm_.base')
+
+    a = rdata[100:140]
+    ref = ArrayRef(a)
+    assert ref.root is rdata
+    assert ref.lo == PS - 140
+    assert ref.hi == PS - 100
+    assert ref.z0 == 39
+    assert ref.shape == (40,)
+    assert ref.stridev == (-1,)
+    assert ref.dtype == data.dtype
+    assert array_equal(ref.deref(), a)
+
+
+    # BigArray with data backend.
+    # data_ is the same as data but shifted to exercise vma and vma->broot offsets calculation.
+    data_ = zeros(8*PS, dtype=uint8)
+    data_[2*PS-1:][:PS] = data
+    f  = BigFile_Data_RO(data_, PS)
+    fh = f.fileh_open()
+    A  = BigArray(data_.shape, data_.dtype, fh)
+    assert array_equal(A[2*PS-1:][:PS], data)
+
+
+    for root in (data, rdata, A):  # both ndarray and BigArray roots
+        # refok verifies whether ArrayRef(x) works ok
+        def refok(x):
+            ref = ArrayRef(x)
+            assert ref.root is root
+            x_ = ref.deref()
+            assert array_equal(x_, x)
+            assert x_.dtype == x.dtype
+            assert type(x_) == type(x)
+
+            # check that deref won't access range outside lo:hi - by copying
+            # root, setting bytes in adjusted root outside lo:hi to either 0x00
+            # or 0xff and tweaking ref.root = root_.
+            root_ = numpy.copy(_flatbytev(root[:]))
+            root_[:ref.lo] = 0
+            root_[ref.hi:] = 0
+            ref.root = root_
+            assert array_equal(ref.deref(), x)
+            root_[:ref.lo] = 0xff
+            root_[ref.hi:] = 0xff
+            assert array_equal(ref.deref(), x)
+
+
+        if isinstance(root, BigArray):
+            a = root[2*PS-1:][:PS]      # get to `data` range
+        # typeof(root) = ndarray
+        elif root is rdata:
+            a = root[::-1]              # rdata
+        else:
+            a = root[:]                 # data
+        assert array_equal(a, data)
+
+        # subslices that are possible to get by just indexing
+        refok( a[:]         )
+        refok( a[1:2]       )
+        refok( a[1:10]      )
+        refok( a[1:10:2]    )
+        refok( a[1:10:3]    )
+        refok( a[1:10:-1]   )   # empty (.size = 0)
+        refok( a[10:1:-1]   )
+        refok( a[10:1:-2]   )
+        refok( a[10:1:-3]   )
+
+        # long chain root -> a -> a[...] -> a[...] -> leaf
+        l = a[2:118]
+        l = l.view(uint32)[3:20]
+        l = l[1:9]
+        refok(l)
+
+        # not aligned - it is not possible to get to resulting slice just by indexing A
+        refok( a.view(uint8)[2:-2].view(uint32)         )
+        refok( a.view(uint8)[2:-2].view(uint32)[::-1]   )
+
+        refok( a.view(int64)        )   # change of type ↑ in size
+        refok( a.view(int64)[::-1]  )
+        refok( a.view(int16)        )   # change of type ↓ in size
+        refok( a.view(int16)[::-1]  )
+
+        # change of type to size not multiple of original
+        refok( a[1:1+5*10].view('V5')       )   # 4 -> 5
+        refok( a[1:1+5*10].view('V5')[::-1] )
+        refok( a[1:1+3*10].view('V3')       )   # 4 -> 3
+        refok( a[1:1+3*10].view('V3')[::-1] )
+
+        # intermediate parent with <0 stride
+        r = a[1:1+3*10].view('V3')[::-1]
+        refok( r[-2:2:-1]   )
+
+        # 2d array
+        x = a.view(uint32).reshape((8, -1))
+        y = swapaxes(x, 0,1)
+        assert x.shape   == (8, PS//(4*8))
+        assert x.strides == (PS//8, 4)
+        assert y.shape   == (PS//(4*8), 8)
+        assert y.strides == (4, PS//8)
+
+        refok( x )
+        refok( y )
+
+        # array with both >0 and <0 strides
+        x_ = x[:,::-1]
+        y_ = y[:,::-1]
+        assert x_.shape   == x.shape
+        assert x_.strides == (PS//8, -4)
+        assert y_.shape   == y.shape
+        assert y_.strides == (4, -PS//8)
+
+        refok( x_ )
+        refok( y_ )
+
+        # array with [1] dimension
+        z1 = x[:, newaxis, :]
+        assert z1.shape   == (8, 1, PS//(4*8))
+        assert z1.strides == (PS//8, 0, 4)
+
+        refok(z1)
+
+        # array with [0] dimension
+        z0 = z1[:, 0:0, :]
+        assert z0.shape   == (8, 0, PS//(4*8))
+        assert z0.strides == (PS//8, 0, 4)
+
+        refok(z0)
+
+        # tricky array overlapping itself
+        t = a.view(uint32)
+        assert t.shape    == (PS//4,)
+        assert t.strides  == (4,)
+        assert t.itemsize == 4
+        t = as_strided(t, strides=(1,))
+        assert t.shape    == (PS//4,)
+        assert t.strides  == (1,)
+        assert t.itemsize == 4
+
+        refok(t)
+
+        # structured dtype
+        s = a.view(dtype=[('width', '<i2'), ('length', '<i2')])
+        assert s.shape    == (PS//4,)
+        assert s.strides  == (4,)
+        assert s.itemsize == 4
+        refok(s)
+
+        s_ = s['length']
+        assert s_.shape    == (PS//4,)
+        assert s_.strides  == (4,)
+        assert s_.itemsize == 2
+        refok(s_)
+
+
+        # ndarray subclass, e.g. np.recarray
+        r = s.view(type=numpy.recarray)
+        assert isinstance(r, numpy.recarray)
+        assert r.shape    == (PS//4,)
+        assert r.strides  == (4,)
+        assert r.itemsize == 4
+        assert array_equal(r.length, s['length'])
+        refok(r)
--- a/bigfile/_bigfile.c
+++ b/bigfile/_bigfile.c
 /* Wendelin.bigfile | Python interface to memory/files
- * Copyright (C) 2014-2015  Nexedi SA and Contributors.
+ * Copyright (C) 2014-2018  Nexedi SA and Contributors.
 *                          Kirill Smelkov <kirr@nexedi.com>
 *
 * This program is free software: you can Use, Study, Modify and Redistribute
@@ -59,12 +59,36 @@ static PyObject *pybuf_str;

 /*
 * python representation of VMA - exposes vma memory as python buffer
+ *
+ * also exposes:
+ *
+ *      .filerange()            to know which range in mmaped file this vma covers.
+ *      .pagesize()             to know page size of underlying RAM.
+ *
+ * and:
+ *
+ *      .addr_start, .addr_stop to know offset of ndarray in VMA.
+ *      .pyuser                 generic python-level attribute (see below).
 */
 struct PyVMA {
    PyObject;
    PyObject *in_weakreflist;

    VMA;
+
+    /* python-level user of this VMA.
+     *
+     * for example for ArrayRef to work, BigArray needs to find out VMA ->
+     * top-level BigArray object for which this VMA was created.
+     *
+     * There is vma -> fileh -> file chain, but e.g. for a given ZBigFile there
+     * can be several ZBigArrays created on top of it to view its data (e.g. via
+     * BigArray.view()). So even if one can go from vma -> zfile, this does not
+     * help to find out the top-level ZBigArray object itself.
+     *
+     * This way we allow BigArray python code to set vma.pyuser attribute
+     * pointing to original BigArray object for which this VMA was created. */
+    PyObject *pyuser;
 };
 typedef struct PyVMA PyVMA;

@@ -140,6 +164,11 @@ void XPyBufferObject_Unpin(PyBufferObject *bufo);
 void XPyBuffer_Unpin(Py_buffer *view);


+#define PyFunc(FUNC, DOC)               \
+static const char FUNC ##_doc[] = DOC;  \
+static PyObject *FUNC
+
+
 /************
 *  PyVMA   *
 ************/
@@ -193,6 +222,50 @@ pyvma_len(PyObject *pyvma0)
 }


+/* pyvma vs cyclic GC */
+static int
+pyvma_traverse(PyObject *pyvma0, visitproc visit, void *arg)
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+
+    Py_VISIT(pyvma->pyuser);
+    return 0;
+}
+
+static int
+pyvma_clear(PyObject *pyvma0)
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+
+    Py_CLEAR(pyvma->pyuser);
+    return 0;
+}
+
+
+PyFunc(pyvma_filerange, "filerange() -> (pgoffset, pglen) -- file range this vma covers")
+    (PyObject *pyvma0, PyObject *args)
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+    Py_ssize_t pgoffset, pglen;     // XXX Py_ssize_t vs pgoff_t
+
+    pgoffset = pyvma->f_pgoffset;
+    pglen    = (pyvma->addr_stop - pyvma->addr_start) / pyvma->fileh->ramh->ram->pagesize;
+    /* NOTE ^^^ addr_stop and addr_start must be page-aligned */
+
+    return Py_BuildValue("(nn)", pgoffset, pglen);
+}
+
+
+PyFunc(pyvma_pagesize, "pagesize() -> pagesize -- page size of RAM underlying this VMA")
+    (PyObject *pyvma0, PyObject *args)
+{
+    PyVMA *pyvma = upcast(PyVMA *, pyvma0);
+    Py_ssize_t pagesize = pyvma->fileh->ramh->ram->pagesize;
+
+    return Py_BuildValue("n", pagesize);
+}
+
+
 static void
 pyvma_dealloc(PyObject *pyvma0)
 {
@@ -210,6 +283,7 @@ pyvma_dealloc(PyObject *pyvma0)
        Py_DECREF(pyfileh);
    }

+    pyvma_clear(pyvma);
    pyvma->ob_type->tp_free(pyvma);
 }

@@ -247,12 +321,35 @@ static /*const*/ PySequenceMethods pyvma_as_seq = {
 };


+static /*const*/ PyMethodDef pyvma_methods[] = {
+    {"filerange",   pyvma_filerange,    METH_VARARGS,   pyvma_filerange_doc},
+    {"pagesize",    pyvma_pagesize,     METH_VARARGS,   pyvma_pagesize_doc},
+    {NULL}
+};
+
+// XXX vvv better switch on various possibilities and find appropriate type
+// (e.g. on X32 uintptr_t will be 4 while long will be 8)
+const int _ =
+    BUILD_ASSERT_OR_ZERO(sizeof(uintptr_t) == sizeof(unsigned long));
+#define T_UINTPTR   T_ULONG
+
+/* Data attributes exposed on VMA objects at Python level.
+ *
+ *   .addr_start, .addr_stop   read-only (READONLY); exported as T_UINTPTR.
+ *   .pyuser                   read-write (flags = 0) object reference; being
+ *                             T_OBJECT_EX, reading it while the underlying
+ *                             pointer is NULL raises AttributeError.
+ *
+ * The last string of each entry is the attribute's docstring as seen from
+ * Python, so it has to describe the very field the entry exports. */
+static /*const*/ PyMemberDef pyvma_members[] = {
+    {"addr_start",  T_UINTPTR,      offsetof(PyVMA, addr_start),  READONLY, "vma's start addr"},
+    {"addr_stop",   T_UINTPTR,      offsetof(PyVMA, addr_stop),   READONLY, "vma's stop addr"},
+    // XXX pyuser: restrict to read-only access?
+    {"pyuser",      T_OBJECT_EX,    offsetof(PyVMA, pyuser),      0,        "user of this vma"},
+    {NULL}  /* sentinel - terminates the table */
+};
+
 static PyTypeObject PyVMA_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name            = "_bigfile.VMA",
    .tp_basicsize       = sizeof(PyVMA),
-    .tp_flags           = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_NEWBUFFER,
-    .tp_methods         = NULL, // TODO ?
+    .tp_flags           = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_NEWBUFFER | Py_TPFLAGS_HAVE_GC,
+    .tp_traverse        = pyvma_traverse,
+    .tp_clear           = pyvma_clear,
+    .tp_methods         = pyvma_methods,
+    .tp_members         = pyvma_members,
    .tp_as_sequence     = &pyvma_as_seq,
    .tp_as_buffer       = &pyvma_as_buffer,
    .tp_dealloc         = pyvma_dealloc,
@@ -268,10 +365,6 @@ static PyTypeObject PyVMA_Type = {
 ****************/


-#define PyFunc(FUNC, DOC)               \
-static const char FUNC ##_doc[] = DOC;  \
-static PyObject *FUNC
-
 PyFunc(pyfileh_mmap, "mmap(pgoffset, pglen) - map fileh part into memory")
    (PyObject *pyfileh0, PyObject *args)
 {