Commit ca064f75 by Kirill Smelkov

bigarray: Support resizing in-place

In NumPy, ndarray has .resize(), but it actually copies the whole array
into a newly allocated larger segment, which makes e.g. appending O(n).

For BigArray, we don't have that internal constraint NumPy has - to
keep the array itself contiguously _stored_ (compare to contiguously
_presented_ in memory). So we can have O(1) resize for big arrays.

NOTE having O(1) resize, here is how O(δ) append can be done:

    A                               # ZBigArray e.g. of shape   (N, 3)
    n = len(A)                      # length of A's major index  =N
    A.resize((n+δ,) + A.shape[1:])  # add δ new entries ; now len(A) =N+δ
    A[-δ:] = <new-data>             # set data for last new δ entries

/cc @klaus
1 parent 929922fa
......@@ -142,6 +142,43 @@ class BigArray(object):
# .base
# ~~~ ndarray-like with different semantics
# resize BigArray in-place
#
# NOTE
#
# - ndarray.resize() works in O(n) time
#
# ( on-growth numpy allocates new memory for whole array and copies data
# there. This is done because numpy.ndarray has to be contiguously stored
# in memory. )
#
# - BigArray.resize() works in O(1) time
#
# ( BigArrays are only mapped to contiguous virtual address-space, and
# storage is organized using separate data blocks. )
#
# NOTE even after BigArray is resized, already-established ndarray views of
# BigArray stay of original size.
def resize(self, new_shape, refcheck=True):
    """Resize the array in-place so that its shape becomes `new_shape`.

    `refcheck` is accepted only for compatibility with the numpy API and
    is ignored: no memory is moved here, so there is nothing to check
    before resizing.

    Resizing amounts to re-initializing the array with the new shape and
    the current dtype. The backing BigFile currently works as if it were
    infinite storage in which not-yet-set blocks read as all zeros, so:

      - if the array grows, further mappings will map the new blocks
        from ._fileh;

      - if the array shrinks, clients will not be allowed to map blocks
        past the array end.
    """
    # TODO discard data from backing file on shrinks.

    # same lookup order as a direct call: initializer first, dtype second
    reinit = self._init0
    current_dtype = self.dtype
    reinit(new_shape, current_dtype, order='C')     # FIXME order hardcoded
# ~~~ get/set item/slice connect bigfile blocks to ndarray in RAM.
# only basic indexing is supported - see numpy/.../arrays.indexing.rst
......
......@@ -20,7 +20,7 @@ from wendelin.bigfile.tests.common_zodb import dbopen, dbclose
from wendelin.bigfile.tests.test_filezodb import kkey, cacheInfo
from persistent import UPTODATE
import transaction
from numpy import dtype, uint8, all
from numpy import dtype, uint8, all, array_equal
def test_zbigarray(tmpdir):
root = dbopen('%s/1.fs' % tmpdir)
......@@ -124,3 +124,38 @@ def test_zbigarray(tmpdir):
assert all(a[33+1:-2] == 0)
assert a[-2] == 98
assert a[-1] == 99
# resize array & append data
A.resize((24*1024*1024,))
assert A.shape == (24*1024*1024,)
assert A.dtype == dtype(uint8)
b = A[:]
assert array_equal(a, b[:16*1024*1024])
b[16*1024*1024] = 100
b[-1] = 255
# commit; reload & verify changes
transaction.commit()
dbclose(root)
del root, a, b, A
root = dbopen('%s/1.fs' % tmpdir)
A = root['zarray']
assert isinstance(A, ZBigArray)
assert A.shape == (24*1024*1024,)
assert A.dtype == dtype(uint8)
a = A[:]
assert all(a[:33] == 0)
assert a[33] == 33
assert all(a[33+1:16*1024*1024-2] == 0)
assert a[16*1024*1024-2] == 98
assert a[16*1024*1024-1] == 99
assert a[16*1024*1024] == 100
assert a[24*1024*1024-1] == 255
......@@ -19,7 +19,7 @@
from wendelin.bigarray import BigArray
from wendelin.bigfile import BigFile
from wendelin.lib.mem import memcpy
from numpy import ndarray, dtype, int32, uint32, uint8, all, arange, multiply, array_equal
from numpy import ndarray, dtype, int32, uint32, uint8, all, zeros, arange, multiply, array_equal
from pytest import raises
......@@ -301,3 +301,63 @@ def test_bigarray_indexing_Nd():
# newaxis - added after at some position(s)
for newaxis in range(3): # 0 - no newaxis
"""
def test_bigarray_resize():
    """Check BigArray.resize() on growth.

    Verifies that:

      - an ndarray view mapped before the resize keeps its original shape
        and content;
      - a view mapped after the resize has the new shape, shares its head
        with the old view and reads zeros in the newly added tail;
      - writes through either view are visible through the other;
      - the old view cannot be indexed past its original end;
      - data written to the new tail is seen by a fresh mapping.
    """
    data = zeros(8*PS, dtype=uint32)
    f    = BigFile_Data(data, PS)
    fh   = f.fileh_open()

    # set first part & ensure it is set correctly
    A = BigArray((10,3), uint32, fh)
    A[:,:] = arange(10*3, dtype=uint32).reshape((10,3))

    a = A[:]
    assert array_equal(a.ravel(), arange(10*3, dtype=uint32))

    # grow array
    A.resize((11,3))

    # a is already mapped, so it should stay the same
    assert array_equal(a.ravel(), arange(10*3, dtype=uint32))

    # mapping it once again maps it whole with new size
    b = A[:]
    assert isinstance(b, ndarray)
    assert b.shape  == (11,3)
    assert b.dtype  == dtype(uint32)

    # head data is the same as a
    assert array_equal(a, b[:10,:])

    # tail is zeros
    assert array_equal(b[10,:], zeros(3, dtype=uint32))

    # old mapping stays valid and changes propagate to/from it
    assert a[0,0] == 0
    assert b[0,0] == 0
    a[0,0] = 1
    assert b[0,0] == 1
    b[0,0] = 2
    assert a[0,0] == 2
    a[0,0] = 0
    assert b[0,0] == 0

    assert a[  -1,-1] == 10*3-1
    assert b[10-1,-1] == 10*3-1
    a[  -1,-1] = 1
    assert b[10-1,-1] == 1
    b[10-1,-1] = 2
    assert a[  -1,-1] == 2
    a[  -1,-1] = 10*3-1
    assert b[10-1,-1] == 10*3-1

    # we cannot access old mapping beyond its end
    # (context-manager form: the string form of raises() is gone in pytest >= 5)
    with raises(IndexError):
        a[10,:]

    # we can change tail
    b[10,:] = arange(10*3, (10+1)*3)

    # map it whole again and ensure we have correct data
    c = A[:]
    assert array_equal(c.ravel(), arange(11*3, dtype=uint32))
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!