RAMArray

RAMArray is compatible to ZBigArray in API and semantic, but stores its data in RAM only. It is useful in situations where ZBigArray compatible data type is needed, but the amount of data is small and the data itself is needed only temporarily - e.g. in a simulation. Please see details in individual patches. Original merge request by @klaus (!8). /cc @Tyagov /reviewed-on !9

RAMArray
RAMArray is compatible to ZBigArray in API and semantic, but stores its data in RAM only. It is useful in situations where ZBigArray compatible data type is needed, but the amount of data is small and the data itself is needed only temporarily - e.g. in a simulation. Please see details in individual patches. Original merge request by @klaus (!8). /cc @Tyagov /reviewed-on !9
99b91c84 · Kirill Smelkov · 318efce0 · fc9b69d8 · 99b91c84 · 99b91c84
Commit 99b91c84 authored Oct 11, 2018 by Kirill Smelkov
Show whitespace changes
Inline Side-by-side

Showing with 198 additions and 40 deletions

bigarray/__init__.py bigarray/__init__.py +3 -2

bigarray/array_ram.py bigarray/array_ram.py +135 -0

bigarray/tests/test_basic.py bigarray/tests/test_basic.py +60 -38

No files found.
--- a/bigarray/__init__.py
+++ b/bigarray/__init__.py
@@ -692,8 +692,9 @@ class ArrayRef(object):
            # 2) or it is a VMA created from under BigArray which will be
            # treated as top-level too, and corrected for in the end.
            basetype = type(base)
-            if basetype.__module__ + "." + basetype.__name__ == "_bigfile.VMA":
-            #if isinstance(base, _bigfile.VMA):  XXX _bigfile does not expose VMA
+            basepath = basetype.__module__ + "." + basetype.__name__
+            if basepath in ("_bigfile.VMA", "wendelin.bigarray.array_ram._VMA"):
+            #if isinstance(base, (_bigfile.VMA, array_ram._VMA)):  XXX _bigfile does not expose VMA
                bigvma = base
            break


--- a/bigarray/array_ram.py
+++ b/bigarray/array_ram.py
+# -*- coding: utf-8 -*-
+# Wendelin.bigarray | RAM Array
+# Copyright (C) 2014-2018  Nexedi SA and Contributors.
+#                          Klaus Wölfel <klaus@nexedi.com>
+#                          Kirill Smelkov <kirr@nexedi.com>
+#
+# This program is free software: you can Use, Study, Modify and Redistribute
+# it under the terms of the GNU General Public License version 3, or (at your
+# option) any later version, as published by the Free Software Foundation.
+#
+# You can also Link and Combine this program with other software covered by
+# the terms of any of the Free Software licenses or any of the Open Source
+# Initiative approved licenses and Convey the resulting work. Corresponding
+# source of such a combination shall include the source code for all other
+# software used.
+#
+# This program is distributed WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See COPYING file for full licensing terms.
+# See https://www.nexedi.com/licensing for rationale and options.
+
+"""Module array_ram provides RAMArray that mimics ZBigArray, but keeps data in RAM.
+
+RAMArray mimics ZBigArray API and semantic, but keeps data in RAM.
+
+RAMArray should be used for temporary objects only - its data is not
+persisted in any way.
+"""
+
+from wendelin import bigarray
+import mmap, os, threading, tempfile, errno
+import numpy as np
+
+
+# RAMArray mimics ZBigArray API and semantic, but keeps data in RAM.
+class RAMArray(bigarray.BigArray):
+
+    def __init__(self, shape, dtype, order='C'):
+        # the whole functionality of RAMArray is in _RAMFileH
+        super(RAMArray, self).__init__(shape, dtype, _RAMFileH(), order)
+
+
+# _RAMFileH mimics _ZBigFileH with data kept in RAM in /dev/shm.
+#
+# ( we have to use mmap from a file in /dev/shm, not e.g. plain ndarray, because
+#   BigArray append semantic is to keep aliasing the data from previously-created
+#   views, and since ndarray.resize copies data, that property would not be preserved. )
+class _RAMFileH(object):
+
+    # we mmap data as read/write by default.
+    # tests can overwrite this to be e.g. only PROT_READ to catch incorrect modifications.
+    _prot = mmap.PROT_READ | mmap.PROT_WRITE
+
+    def __init__(self):
+        # create temporary file in dev/shm and unlink it.
+        # ._fh keeps opened file descriptor to it.
+        fh, path = tempfile.mkstemp(dir="/dev/shm", prefix="ramfile.")
+        os.unlink(path)
+        self._fh = fh
+
+        # mmap(2) allows mmaping past the end, but python's mmap does not.
+        # we workaround it with explicitly growing file as needed.
+        # however we need to protect against races between concurrent .mmap() calls.
+        # ._mmapmu is used for this.
+        self._mmapmu = threading.Lock()
+
+    def mmap(self, pgoffset, pglen):
+        offset = pgoffset * bigarray.pagesize
+        length = pglen    * bigarray.pagesize
+
+        with self._mmapmu:
+            # grow file, if needed, to cover mmaped range
+            needsize = offset + length
+            st = os.fstat(self._fh)
+            if st.st_size < needsize:
+                try:
+                    os.ftruncate(self._fh, needsize)
+                except OverflowError as e:
+                    # OverflowError: Python int too large to convert to C long
+                    raise MemoryError(e)
+
+            # create requested mmap
+            try:
+                return _VMA(self._fh, pgoffset, pglen, bigarray.pagesize, self._prot)
+
+            # ENOMEM -> MemoryError (similarly to BigFile)
+            except mmap.error as e:
+                if e.errno == errno.ENOMEM:
+                    raise MemoryError(e)
+                raise
+
+    def __del__(self):
+        os.close(self._fh)
+
+
+# _VMA mimics PyVMA.
+#
+# it is just mmap.mmap, but, similarly to PyVMA, allows to set .pyuser and
+# exposes other PyVMA compatible attributes.
+class _VMA(mmap.mmap):
+    __slots__ = ['_pgoffset', '_pglen', '_pagesize', 'pyuser']
+
+    def __new__(cls, fh, pgoffset, pglen, pagesize, prot):
+        vma = mmap.mmap.__new__(cls,
+                fh,
+                length  = pglen * pagesize,
+                flags   = mmap.MAP_SHARED,
+                prot    = prot,
+                offset  = pgoffset * pagesize)
+
+        vma._pgoffset   = pgoffset
+        vma._pglen      = pglen
+        vma._pagesize   = pagesize
+        return vma
+
+    def pagesize(self):
+        return self._pagesize
+
+    def filerange(self):
+        return (self._pgoffset, self._pglen)
+
+    @property
+    def addr_start(self):
+        # find out address where we are mmapped
+        a = np.ndarray(shape=(len(self),), dtype=np.uint8, buffer=self)
+        adata = a.__array_interface__.get('data')
+        assert adata is not None, "TODO __array_interface__.data = None"
+        assert isinstance(adata, tuple), "TODO __array_interface__.data is %r" % (adata,)
+        # adata is (data, readonly)
+        return adata[0]
+
+    @property
+    def addr_stop(self):
+        return self.addr_start + len(self)
--- a/bigarray/tests/test_basic.py
+++ b/bigarray/tests/test_basic.py
@@ -20,6 +20,7 @@
 # See https://www.nexedi.com/licensing for rationale and options.

 from wendelin.bigarray import BigArray, ArrayRef, _flatbytev
+from wendelin.bigarray.array_ram import _RAMFileH
 from wendelin.bigfile import BigFile
 from wendelin.lib.mem import memcpy
 from wendelin.lib.calc import mul
@@ -28,7 +29,8 @@ from numpy import ndarray, dtype, int64, int32, uint32, int16, uint8, all, zeros
 from numpy.lib.stride_tricks import as_strided
 import numpy

-from pytest import raises
+import os, mmap
+from pytest import raises, fixture


 # Synthetic bigfile that just loads zeros, and ignores writes (= a-la /dev/zero)
@@ -59,19 +61,49 @@ class BigFile_Data(BigFile):
        memcpy(self.datab[self.blksize * blk : self.blksize * (blk+1)], buf)


-# synthetic bigfile that only loads data from numpy array
-class BigFile_Data_RO(BigFile_Data):
-    def storeblk(self, blk, buf):
+PS = 2*1024*1024    # FIXME hardcoded, TODO -> ram.pagesize
+
+# tBigFile provides .fopen() to fileh_open a big file handle via ^^^ BigFile_*
+class tBigFile:
+    def fopen(self, data=None, readonly=False):
+        if data is None:
+            bigf = BigFile_Zero(PS)
+        else:
+            bigf = BigFile_Data(data, PS)
+
+        if readonly:
+            def _(self, blk, buf):
                raise RuntimeError('tests should not try to change test data')
+            bigf.storeblk = _

+        return bigf.fileh_open()
+
+# tRAM provides .fopen() to open a file handle via _RAMFileH.
+class tRAM:
+    def fopen(self, data=None, readonly=False):
+        fh = _RAMFileH()
+        if data is not None:
+            fh2 = os.dup(fh._fh)    # fdopen takes ownershipf of fd and closes it
+            with os.fdopen(fh2, 'wb') as f:
+                f.write(data)
+
+        if readonly:
+            fh._prot = mmap.PROT_READ
+
+        return fh
+
+# testbig is fixture that provides .fopen(...) to open a big file handle from
+# ^^^ BigFile_* or correspondingly from RAM.
+@fixture(scope="module", params=[tBigFile, tRAM])
+def testbig(request):
+    cls = request.param
+    yield cls()

-PS = 2*1024*1024    # FIXME hardcoded, TODO -> ram.pagesize


 # make sure we don't let dtype with object to be used with BigArray
-def test_bigarray_noobject():
-    Z  = BigFile_Zero(PS)
-    Zh = Z.fileh_open()
+def test_bigarray_noobject(testbig):
+    Zh = testbig.fopen()

    # NOTE str & unicode are fixed-size types - if size is not explicitly given
    # it will become S0 or U0
@@ -81,9 +113,8 @@ def test_bigarray_noobject():


 # basic ndarray-compatibility attributes of BigArray
-def test_bigarray_basic():
-    Z  = BigFile_Zero(PS)
-    Zh = Z.fileh_open()
+def test_bigarray_basic(testbig):
+    Zh = testbig.fopen()

    A = BigArray((10,3), int32, Zh)

@@ -143,9 +174,8 @@ class DoubleCheck(DoubleGet):


 # getitem/setitem (1d case)
-def test_bigarray_indexing_1d():
-    Z  = BigFile_Zero(PS)
-    Zh = Z.fileh_open()
+def test_bigarray_indexing_1d(testbig):
+    Zh = testbig.fopen()

    A = BigArray((10*PS,), uint8, Zh)

@@ -303,12 +333,11 @@ def test_bigarray_indexing_1d():


 # indexing where accessed element overlaps edge between pages
-def test_bigarray_indexing_pageedge():
+def test_bigarray_indexing_pageedge(testbig):
    shape = (10, PS-1)
    data  = arange(mul(shape), dtype=uint32).view(uint8)    # NOTE 4 times bigger than uint8

-    f  = BigFile_Data_RO(data, PS)
-    fh = f.fileh_open()
+    fh = testbig.fopen(data, readonly=True)

    A  = BigArray(shape, uint8, fh)                     # bigarray with test data and shape
    A_ = data[:mul(shape)].reshape(shape)               # ndarray  ----//----
@@ -325,8 +354,7 @@ def test_bigarray_indexing_pageedge():


    shape = (10, PS+1)
-    f  = BigFile_Data_RO(data, PS)
-    fh = f.fileh_open()
+    fh = testbig.fopen(data, readonly=True)

    A  = BigArray(shape, uint8, fh)
    A_ = data[:mul(shape)].reshape(shape)
@@ -371,7 +399,7 @@ def idx_to_test(shape, idx_prefix=()):


 # getitem/setitem (Nd case)
-def test_bigarray_indexing_Nd():
+def test_bigarray_indexing_Nd(testbig):
    # shape of tested array - all primes, total size for uint32 ~ 7 2M pages
    # XXX even less dimensions (to speed up tests)?
    shape = tuple(reversed( (17,23,101,103) ))
@@ -381,8 +409,7 @@ def test_bigarray_indexing_Nd():
    #      (else data slice will be smaller than buf)
    data  = arange(mul(shape) + PS, dtype=uint32)

-    f  = BigFile_Data_RO(data, PS)
-    fh = f.fileh_open()
+    fh = testbig.fopen(data, readonly=True)

    for order in ('C', 'F'):
        A  = BigArray(shape, uint32, fh, order=order)       # bigarray with test data and shape
@@ -440,10 +467,9 @@ def test_bigarray_indexing_Nd():
        """


-def test_bigarray_resize():
+def test_bigarray_resize(testbig):
    data = zeros(8*PS, dtype=uint32)
-    f   = BigFile_Data(data, PS)
-    fh  = f.fileh_open()
+    fh  = testbig.fopen(data)

    # set first part & ensure it is set correctly
    A   = BigArray((10,3), uint32, fh)
@@ -507,11 +533,10 @@ def arange32_f(start, stop, dtype=None):
    return arange(start*3*2, stop*3*2, dtype=dtype).reshape(2,3,(stop-start), order='F')
    #return arange(start*3*2, stop*3*2, dtype=dtype).reshape(2,3,(stop-start))

-def test_bigarray_append():
+def test_bigarray_append(testbig):
    for order in ('C', 'F'):
        data = zeros(8*PS, dtype=uint32)
-        f   = BigFile_Data(data, PS)
-        fh  = f.fileh_open()
+        fh  = testbig.fopen(data)

        arange32 = {'C': arange32_c, 'F': arange32_f} [order]

@@ -552,9 +577,8 @@ def test_bigarray_append():



-def test_bigarray_list():
-    Z  = BigFile_Zero(PS)
-    Zh = Z.fileh_open()
+def test_bigarray_list(testbig):
+    Zh = testbig.fopen()
    A = BigArray((10,), uint8, Zh)

    # the IndexError for out-of-bound scalar access should allow, though
@@ -564,9 +588,8 @@ def test_bigarray_list():
    assert l == [0]*10


-def test_bigarray_to_ndarray():
-    Z  = BigFile_Zero(PS)
-    Zh = Z.fileh_open()
+def test_bigarray_to_ndarray(testbig):
+    Zh = testbig.fopen()
    A = BigArray((10,), uint8, Zh)

    # without IndexError on out-of-bound scalar access, the following
@@ -594,7 +617,7 @@ def test_bigarray_to_ndarray():



-def test_arrayref():
+def test_arrayref(testbig):
    # test data - all items are unique - so we can check array by content
    data = zeros(PS, dtype=uint8)
    data32 = data.view(uint32)
@@ -683,8 +706,7 @@ def test_arrayref():
    # data_ is the same as data but shifted to exercise vma and vma->broot offsets calculation.
    data_ = zeros(8*PS, dtype=uint8)
    data_[2*PS-1:][:PS] = data
-    f  = BigFile_Data_RO(data_, PS)
-    fh = f.fileh_open()
+    fh = testbig.fopen(data_, readonly=True)
    A  = BigArray(data_.shape, data_.dtype, fh)
    assert array_equal(A[2*PS-1:][:PS], data)