Commit 0c826d5c authored by Kirill Smelkov

BigArray: An ndarray-like on top of BigFile memory mappings

I.e. something like numpy.memmap, but on top of BigFile instead of OS files.
The whole bigarray cannot be used as a drop-in replacement for numpy arrays,
but BigArray _slices_ are real ndarrays and can be used everywhere an ndarray
can be used, including in C/Fortran code. Slice size is limited by the
mapping-size (= address-space size) limit, i.e. to at most ~127TB on
Linux/amd64.

Changes to bigarray memory are changes to the bigfile memory mapping, and as
such they can be discarded or saved back to the bigfile via the mapping's
(= BigFileH's) dirty discard/writeout interface.

For the same reason, the total amount of unsaved changes to memory is limited
by the amount of physical RAM.
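
A minimal usage sketch, mirroring the tests added with this commit
(BigFile_Zero is the zero-backed loadblk/storeblk stub defined there):

    from wendelin.bigarray import BigArray
    from numpy import int32

    Z  = BigFile_Zero(2*1024*1024)      # BigFile subclass; blksize = 2MB
    Zh = Z.fileh_open()                 # bigfile memory-mapping handle
    A  = BigArray((10, 3), int32, Zh)   # ndarray-like over the mapping
    a  = A[:5]                          # a real ndarray view of the mapping
    a[0, 0] = 1                         # writes dirty the mapped pages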
parent 4174b84a
# -*- coding: utf-8 -*-
# BigArray submodule for Wendelin
# Copyright (C) 2014-2015 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Open Source Initiative approved licenses and Convey
# the resulting work. Corresponding source of such a combination shall include
# the source code for all other software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
"""BigArrays are NumPy ndarray-like interface on top of BigFile memory mappings.
I.e. something like numpy.memmap for numpy.ndarray and OS files. The whole
bigarray cannot be used as a drop-in replacement for numpy arrays, but BigArray
_slices_ are real ndarrays and can be used everywhere ndarray can be used,
including in C/Fortran code. Slice size is limited by mapping-size (=
address-space size) limit, i.e. to ~ max 127TB on Linux/amd64.
Changes to bigarray memory are changes to bigfile memory mapping and as such
can be discarded or saved back to bigfile using mapping (= BigFileH) dirty
discard/writeout interface. For ZBigFile that means changes can be
discarded & saved via transactions.
For the same reason the whole amount of changes to memory is limited by amount
of physical RAM.
"""
from __future__ import print_function
from numpy import ndarray, dtype, multiply, sign, newaxis
pagesize = 2*1024*1024 # FIXME hardcoded, TODO -> fileh.ram.pagesize
class BigArray(object):
# numpy.ndarray like
# XXX can't use slots, because that would create "multiple bases have
# instance lay-out conflict" with Persistent for ZBigArray
"""
__slots__ = (
'_dtype', # items data type (numpy.dtype)
'_shape', # []int
    # ._stridev   []int:  stride_j = prod(shape[j+1:]) * dtype.itemsize  (C order)
    # XXX on start translates to strides
# .order 'C' or 'F' XXX other orders?
'_v_fileh', # bigfile memory mapping for this array
)
"""
# TODO doc -> see ndarray
# NOTE does not accept strides
# TODO handle order
# NOTE please be cooperative to ZBigArray and name helper data members starting with _v_
def __init__(self, shape, dtype_, bigfileh, order='C'):
self._init0(shape, dtype_, order)
self._v_fileh = bigfileh
# __init__ part without fileh
def _init0(self, shape, dtype_, order):
self._dtype = dtype(dtype_)
self._shape = shape
# TODO +offset ?
# TODO +strides ?
if order != 'C':
raise NotImplementedError('Order %s not supported' % order)
# shape, dtype -> ._stridev
# TODO take dtype.alignment into account ?
        # NOTE (1,) so that multiply.reduce returns 1 (not 1.0) for an empty tail
self._stridev = tuple( multiply.reduce((1,) + shape[i+1:]) * self._dtype.itemsize \
for i in range(len(shape)) )
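        # e.g. shape=(10,3), dtype=int32 (itemsize=4) -> _stridev = (3*4, 1*4) = (12, 4)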
# ~~~ ndarray-like attributes
@property
def data(self):
raise TypeError("Direct access to data for BigArray is forbidden")
@property
def strides(self):
return self._stridev
@property
def dtype(self):
# TODO support assigning new dtype
return self._dtype
@property
def shape(self):
# TODO support assigning new shape
return self._shape
@property
def size(self):
return multiply.reduce(self._shape)
def __len__(self):
        # length of the first axis
return self._shape[0] # FIXME valid only for C-order
@property
def itemsize(self):
return self._dtype.itemsize
@property
def nbytes(self):
return self.itemsize * self.size
@property
def ndim(self):
return len(self._shape)
def view(self, dtype=None, type=None):
raise NotImplementedError # TODO
# TODO more ndarray-like attributes
# .T
# .flags <--
# .flat
# .imag
# .real
# .base
# ~~~ get/set item/slice connect bigfile blocks to ndarray in RAM.
# only basic indexing is supported - see numpy/.../arrays.indexing.rst
#
# NOTE it would be good if we could reuse prepare_index() &
# npy_index_info from numpy/mapping.[ch]
# access to mapping via property, so that children could hook into it
# (e.g. ZBigArray creates mapping lazily on 1st access)
@property
def _fileh(self):
return self._v_fileh
def __getitem__(self, idx):
# NOTE basic indexing means idx = tuple(slice | int) + sugar(newaxis, ellipsis)
#print('\n__getitem__', idx)
# handle 1d slices uniformly with Nd
if not isinstance(idx, tuple):
idx = (idx,)
idx = list(idx)
# expand ellipsis
try:
ellidx = idx.index(Ellipsis)
except ValueError:
# no ellipsis - nothing to do
pass
else:
# ellipsis present - check there is only 1
            if idx[ellidx+1:].count(Ellipsis):
                raise IndexError('multiple ellipsis not allowed')
            # and expand with `:,:,...` in place of the ellipsis
            # (no need to check nexpand < 0 -- [...]*-1 = [])
            nexpand = len(self.shape) - (len(idx) - idx.count(newaxis) - 1)
            idx[ellidx:ellidx+1] = [slice(None)] * nexpand
#print('...\t->', idx)
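        # e.g. for ndim=3, idx=[0, Ellipsis, 0]: nexpand = 3-(3-0-1) = 1,
        # so idx becomes [0, slice(None), 0]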
# expand idx with : to match shape
        # (no need to check for expanding e.g. -1 times -- [...]*-1 = [])
idx.extend( [slice(None)] * (len(self.shape) - len(idx) - idx.count(newaxis)) )
#print('expand\t->', idx)
# 1) for newaxis - remember we'll need to increase dimensionality
# there after we take view
#
# 2) for scalars - convert `i -> i:i+1` and remember we'll need to reduce
# dimensionality at that position
dim_adjust = [slice(None)] * len(idx) # :,:,:,...
for i in range(len(idx)):
if idx[i] is newaxis:
dim_adjust[i] = newaxis # [newaxis] will increase ndim
elif not isinstance(idx[i], slice):
_ = idx[i]
if _ < 0:
_ = self.shape[i] + _ # -1 -> N-1 (or else -1:-1+1 -> -1:0 = empty)
idx[i] = slice(_, _+1)
dim_adjust[i] = 0 # [0] will reduce ndim
        # NOTE if dim_adjust stayed a list with all elements int, numpy would activate advanced indexing
dim_adjust = tuple(dim_adjust)
#print('scalars\t->', idx)
#print('dim_adj\t->', dim_adjust)
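        # e.g. idx=[newaxis, 2, slice(None)] -> idx=[newaxis, slice(2,3), slice(None)]
        #      with dim_adjust=(newaxis, 0, slice(None))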
# filter-out newaxis from index - so we first work with concrete positions
try:
# XXX not optimal
while 1:
idx.remove(newaxis)
except ValueError:
# no more newaxis - ok
pass
#print('ønewax\t->', idx)
# ensure there are no more indices than we can serve
if len(idx) > len(self.shape):
raise IndexError('too many indices')
# above we cared to expand to shape, if needed
assert len(idx) == len(self.shape)
# now we have:
# - idx and shape are of the same size
# - idx contains only slice objects
# - dim_adjust was prepared for taking scalar and newaxis indices into
# account after we take ndarray view
# major index / stride
# FIXME assumes C ordering
idx0 = idx[0]
stride0 = self._stridev[0]
shape0 = self._shape[0]
# major idx start/stop/stride
idx0_start, idx0_stop, idx0_stride = idx0.indices(shape0)
#print('idx0:\t', idx0, '-> [%s:%s:%s]' % (idx0_start, idx0_stop, idx0_stride))
#print('strid0:\t', stride0) #, self._stridev
#print('shape0:\t', shape0) #, self._shape
# nitems in major row
nitems0 = (idx0_stop - idx0_start - sign(idx0_stride)) // idx0_stride + 1
#print('nitem0:\t', nitems0)
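        # e.g. [4:9:2]  -> (9-4-1)//2 + 1    = 3 items (4,6,8);
        #      [9:4:-2] -> (4-9+1)//(-2) + 1 = 3 items (9,7,5)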
# if major row is "empty" slice, we can return right away without creating vma.
# e.g. 10:5:1, 5:10:-1, 5:5, size+100:size+200 -> []
if nitems0 <= 0:
return ndarray((0,) + self._shape[1:], self._dtype)
# major slice -> in bytes
byte0_start = idx0_start * stride0
byte0_stop = idx0_stop * stride0
byte0_stride = idx0_stride * stride0
# major slice -> in file pages, always increasing, inclusive
page0_min = min(byte0_start, byte0_stop+byte0_stride) // pagesize # TODO -> fileh.pagesize
page0_max = max(byte0_stop-byte0_stride, byte0_start) // pagesize # TODO -> fileh.pagesize
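        # e.g. for a 1d uint8 array, [4*PS+3 : 9*PS-3] (PS = pagesize) gives
        #   page0_min = (4*PS+3)//PS = 4,  page0_max = (9*PS-4)//PS = 8,
        # i.e. pages 4..8 inclusive get mmapped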
# ~~~ mmap file part corresponding to full major slice into memory
vma0 = self._fileh.mmap(page0_min, page0_max-page0_min+1)
# first get ndarray view with only major slice specified and rest indices being ":"
view0_shape = (nitems0,) + self._shape[1:]
view0_offset = byte0_start - page0_min * pagesize # TODO -> fileh.pagesize
view0_stridev = (byte0_stride,) + self._stridev[1:]
#print('view0_shape:\t', view0_shape, self.shape)
#print('view0_offset:\t', view0_offset)
#print('len(vma0):\t', len(vma0))
view0 = ndarray(view0_shape, self._dtype, vma0, view0_offset, view0_stridev)
        # now take into account indices after the major one
view = view0[(slice(None),) + tuple(idx[1:])]
#print('view0:\t', view0.shape)
#print('view:\t', view.shape)
#print('View:\t', view)
#print('view/d:\t', view[dim_adjust])
        # and finally take the dimension adjustments into account and we are done
return view[dim_adjust]
def __setitem__(self, idx, v):
# TODO idx = int, i.e. scalar assign
        # represent the changed area as an ndarray via getitem, then leverage ndarray assignment
a = self.__getitem__(idx)
a[:] = v
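        # e.g. A[2:5] = [6,7,8]: `a = A[2:5]` maps the region into RAM, and
        # `a[:] = [6,7,8]` writes through the mapping, dirtying its pages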
# XXX __array__(self) = self[:] ?
    # (for numpy functions to accept bigarray as-is (if size permits))
../../bigfile/tests/__init__.py
# Wendelin.core.bigarray | Basic tests
# Copyright (C) 2014-2015 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Open Source Initiative approved licenses and Convey
# the resulting work. Corresponding source of such a combination shall include
# the source code for all other software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
from wendelin.bigarray import BigArray
from wendelin.bigfile import BigFile
from wendelin.lib.mem import memcpy
from numpy import ndarray, dtype, int32, uint32, uint8, all, arange, multiply, array_equal
from pytest import raises
# Synthetic bigfile that just loads zeros, and ignores writes (= a-la /dev/zero)
class BigFile_Zero(BigFile):
def loadblk(self, blk, buf):
# Nothing to do here - the memory buf obtained from OS comes pre-cleared
# XXX reenable once/if memory comes uninitialized here
return
def storeblk(self, blk, buf):
return
PS = 2*1024*1024 # FIXME hardcoded, TODO -> ram.pagesize
# basic ndarray-compatibility attributes of BigArray
def test_bigarray_basic():
Z = BigFile_Zero(PS)
Zh = Z.fileh_open()
A = BigArray((10,3), int32, Zh)
raises(TypeError, "A.data")
assert A.strides == (12, 4)
assert A.dtype == dtype(int32)
# XXX .flags?
# XXX .flat? (non-basic)
# XXX .imag? (non-basic)
# XXX .real? (non-basic)
assert A.size == 10*3
assert len(A) == 10
assert A.itemsize == 4
assert A.nbytes == 4*10*3
assert A.ndim == 2
assert A.shape == (10,3)
# XXX .ctypes (non-basic)
# TODO .base
# DoubleGet(obj1, obj2)[key] -> obj1[key], obj2[key]
class DoubleGet:
def __init__(self, obj1, obj2):
self.obj1 = obj1
self.obj2 = obj2
def __getitem__(self, key):
return self.obj1[key], self.obj2[key]
# getitem/setitem (1d case)
def test_bigarray_indexing_1d():
Z = BigFile_Zero(PS)
Zh = Z.fileh_open()
A = BigArray((10*PS,), uint8, Zh)
# ndarray of the same shape - we'll use it to get slices and compare result
# shape/stride against BigArray.__getitem__
A_= ndarray ((10*PS,), uint8)
# AA[key] -> A[key], A_[key]
AA = DoubleGet(A, A_)
# "empty" slices
assert A[10:5:1] .size == 0
assert A[5:10:-1] .size == 0
assert A[5:5] .size == 0
assert A[100*PS:200*PS] .size == 0
# whole array
a, _ = AA[:]
assert isinstance(a, ndarray)
assert a.dtype == dtype(uint8)
assert a.shape == _.shape
assert a.strides == _.strides
assert a[0] == 0
assert a[5*PS] == 0
assert a[10*PS-1] == 0
# overlaps with a
b, _ = AA[4*PS:]
assert isinstance(b, ndarray)
assert b.dtype == dtype(uint8)
assert b.shape == _.shape
assert b.strides == _.strides
assert b[0] == 0
assert b[1*PS] == 0
assert b[5*PS-1] == 0
# a <-> b
assert b[1*PS] == 0
a[5*PS] = 1
assert b[1*PS] == 1
# non-pagesize aligned slice
c, _ = AA[4*PS+3 : 9*PS-3]
assert isinstance(c, ndarray)
assert c.dtype == dtype(uint8)
assert c.shape == _.shape
assert c.strides == _.strides
assert c[0] == 0
assert c[-1] == 0
# a <-> b <-> c
assert b[3] == 0
assert a[4*PS+3] == 0
c[0] = 3
assert b[3] == 3
assert a[4*PS+3] == 3
assert b[5*PS-4] == 0
assert a[9*PS-4] == 0
c[-1] = 99
assert b[5*PS-4] == 99
assert a[9*PS-4] == 99
# negative stride
d, _ = AA[9*PS+1:4*PS-1:-1]
assert isinstance(d, ndarray)
assert d.dtype == dtype(uint8)
assert d.shape == _.shape
assert d.strides == _.strides
assert all(d[:5] == 0)
assert d[5] == 99 # c[-1]
assert all(d[6:-(PS+1)] == 0)
assert d[-(PS+1)] == 1 # a[5*PS]
assert all(d[-PS:-4] == 0)
assert d[-4] == 3 # c[0]
assert all(d[-3:] == 0)
# like c, but stride > 1
e, _ = AA [4*PS+3 : 9*PS-3 : 7]
assert isinstance(e, ndarray)
assert e.dtype == dtype(uint8)
assert e.shape == _.shape
assert e.strides == _.strides
c[0] = 4
assert e[0] == c[0]
c[0] = 5
assert e[0] == c[0]
c[7] = 7
assert e[1] == c[7]
c[7] = 8
assert e[1] == c[7]
# TODO check more
# like d, but stride < -1
f, _ = AA[9*PS+1:4*PS-1:-11]
assert isinstance(f, ndarray)
assert f.dtype == dtype(uint8)
assert f.shape == _.shape
assert f.strides == _.strides
d[0] = 11
assert f[0] == d[0]
d[0] = 12
assert f[0] == d[0]
d[11] = 13
assert f[1] == d[11]
d[11] = 14
assert f[1] == d[11]
# TODO check more
# setitem
A[2*PS+1:3*PS+2] = 5
assert all(a[2*PS+1 : 3*PS+2] == 5)
assert a[2*PS] == 0
assert a[3*PS+3] == 0
A[2*PS+2:2*PS+5] = [6,7,8]
assert a[2*PS+0] == 0
assert a[2*PS+1] == 5
assert a[2*PS+2] == 6
assert a[2*PS+3] == 7
assert a[2*PS+4] == 8
assert a[2*PS+5] == 5
assert a[2*PS+6] == 5
assert raises(ValueError, 'A[:4] = range(5)')
# given dimension length n, yield index variants to test
def indices_to_test(n):
# ":"
yield slice(None)
# int
yield 0
yield -1
yield n//2
# start:stop:stride
yield slice(1,-1)
yield slice(n//4+1, n*3//4-1, 2)
yield slice(n//5+1, n*4//5-1, 3)
# given shape, yield all Nd idx variants, where every index iterates over the full indices_to_test
def idx_to_test(shape, idx_prefix=()):
leaf = len(shape) <= 1
for i in indices_to_test(shape[0]):
idx = idx_prefix + (i,)
if leaf:
yield idx
else:
# = yield from
for _ in idx_to_test(shape[1:], idx):
yield _
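# e.g. idx_to_test((3,4)) yields (slice(None), slice(None)),
# (slice(None), 0), (slice(None), -1), ... - the full cross-product
# indices_to_test(3) x indices_to_test(4)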
# getitem/setitem (Nd case)
def test_bigarray_indexing_Nd():
# shape of tested array - all primes, total size for uint32 ~ 7 2M pages
    # XXX even fewer dimensions (to speed up tests)?
shape = tuple(reversed( (17,23,101,103) ))
# test data - all items are unique - so we can check array by content
# NOTE +PS so that BigFile_Data has no problem loading last blk
# (else data slice will be smaller than buf)
data = arange(multiply.reduce(shape) + PS, dtype=uint32)
# synthetic bigfile that loads data from `data`
class BigFile_Data(BigFile):
def loadblk(self, blk, buf):
datab = data.view(uint8)
x = datab[self.blksize * blk : self.blksize * (blk+1)]
memcpy(buf, x)
def storeblk(self, blk, buf):
raise RuntimeError('tests should not try to change test data')
f = BigFile_Data(PS)
fh = f.fileh_open()
A = BigArray(shape, uint32, fh) # bigarray with test data and shape
A_ = data[:multiply.reduce(shape)].reshape(shape) # ndarray ----//----
# now just go over combinations of various slice at each dimension, and see
# whether slicing result is the same ndarray would do.
for idx in idx_to_test(shape):
a = A [idx]
a_ = A_[idx]
assert array_equal(a, a_)
# TODO ... -> expanded (0,1,2,negative), rejected if many
# TODO newaxis
# TODO nidx < len(shape)
# TODO empty slice in major row, empty slice in secondary row
"""
# ellipsis - take some idx[a:b] and replace it by ...
for ellipsis in range(2): # 0 - no ellipsis
# newaxis - added after at some position(s)
for newaxis in range(3): # 0 - no newaxis
"""
setup.py
@@ -182,9 +182,9 @@ setup(
     packages = ['wendelin'] + ['wendelin.%s' % _ for _ in
                     find_packages(exclude='3rdparty')],
     install_requires = [
-        'numpy',    # lib/mem
+        'numpy',    # BigArray + its children
 
-        # for ZBigFile
+        # for ZBigFile / ZBigArray
         # ( NOTE: ZODB3 3.11 just pulls in latest ZODB _4_, so this way
         #        specifying ZODB _3_ we allow external requirements to
         #        specify either to use e.g. ZODB3.10 or ZODB4 )