Commit 0c826d5c authored by Kirill Smelkov

BigArray: An ndarray-like on top of BigFile memory mappings

I.e. something like numpy.memmap, but on top of BigFile instead of OS files.
The whole bigarray cannot be used as a drop-in replacement for numpy arrays,
but BigArray _slices_ are real ndarrays and can be used everywhere an ndarray
can be used, including in C/Fortran code. Slice size is limited by the
mapping-size (= address-space size) limit, i.e. to at most ~127TB on
Linux/amd64.

Changes to bigarray memory are changes to the bigfile memory mapping, and as
such they can be discarded or saved back to the bigfile via the mapping's
(= BigFileH's) dirty discard/writeout interface.

For the same reason, the total amount of unsaved changes to memory is limited
by the amount of physical RAM.
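
A minimal usage sketch, mirroring the tests added with this commit
(BigFile_Zero is the zero-backed loadblk/storeblk stub defined there):

    from wendelin.bigarray import BigArray
    from numpy import int32

    Z  = BigFile_Zero(2*1024*1024)      # BigFile subclass; blksize = 2MB
    Zh = Z.fileh_open()                 # bigfile memory-mapping handle
    A  = BigArray((10, 3), int32, Zh)   # ndarray-like over the mapping
    a  = A[:5]                          # a real ndarray view of the mapping
    a[0, 0] = 1                         # writes dirty the mapped pages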
parent 4174b84a
# -*- coding: utf-8 -*-
# BigArray submodule for Wendelin
# Copyright (C) 2014-2015 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Open Source Initiative approved licenses and Convey
# the resulting work. Corresponding source of such a combination shall include
# the source code for all other software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
"""BigArrays are NumPy ndarray-like interface on top of BigFile memory mappings.
I.e. something like numpy.memmap for numpy.ndarray and OS files. The whole
bigarray cannot be used as a drop-in replacement for numpy arrays, but BigArray
_slices_ are real ndarrays and can be used everywhere ndarray can be used,
including in C/Fortran code. Slice size is limited by mapping-size (=
address-space size) limit, i.e. to ~ max 127TB on Linux/amd64.
Changes to bigarray memory are changes to bigfile memory mapping and as such
can be discarded or saved back to bigfile using mapping (= BigFileH) dirty
discard/writeout interface. For ZBigFile that means changes can be
discarded & saved via transactions.
For the same reason the whole amount of changes to memory is limited by amount
of physical RAM.
"""
from __future__ import print_function
from numpy import ndarray, dtype, multiply, sign, newaxis
pagesize = 2*1024*1024 # FIXME hardcoded, TODO -> fileh.ram.pagesize
class BigArray(object):
# numpy.ndarray like
# XXX can't use slots, because that would create "multiple bases have
# instance lay-out conflict" with Persistent for ZBigArray
"""
__slots__ = (
'_dtype', # items data type (numpy.dtype)
'_shape', # []int
    # ._stridev   []int:  stride_j = prod(shape[j+1:]) * dtype.itemsize  (C order)
    # XXX on start translates to strides
# .order 'C' or 'F' XXX other orders?
'_v_fileh', # bigfile memory mapping for this array
)
"""
# TODO doc -> see ndarray
# NOTE does not accept strides
# TODO handle order
# NOTE please be cooperative to ZBigArray and name helper data members starting with _v_
def __init__(self, shape, dtype_, bigfileh, order='C'):
self._init0(shape, dtype_, order)
self._v_fileh = bigfileh
# __init__ part without fileh
def _init0(self, shape, dtype_, order):
self._dtype = dtype(dtype_)
self._shape = shape
# TODO +offset ?
# TODO +strides ?
if order != 'C':
raise NotImplementedError('Order %s not supported' % order)
# shape, dtype -> ._stridev
# TODO take dtype.alignment into account ?
        # NOTE (1,) so that multiply.reduce returns 1 (not 1.0) for an empty tail
self._stridev = tuple( multiply.reduce((1,) + shape[i+1:]) * self._dtype.itemsize \
for i in range(len(shape)) )
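        # e.g. shape=(10,3), dtype=int32 (itemsize=4) -> _stridev = (3*4, 1*4) = (12, 4)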
# ~~~ ndarray-like attributes
@property
def data(self):
raise TypeError("Direct access to data for BigArray is forbidden")
@property
def strides(self):
return self._stridev
@property
def dtype(self):
# TODO support assigning new dtype
return self._dtype
@property
def shape(self):
# TODO support assigning new shape
return self._shape
@property
def size(self):
return multiply.reduce(self._shape)
def __len__(self):
        # length of the first axis
return self._shape[0] # FIXME valid only for C-order
@property
def itemsize(self):
return self._dtype.itemsize
@property
def nbytes(self):
return self.itemsize * self.size
@property
def ndim(self):
return len(self._shape)
def view(self, dtype=None, type=None):
raise NotImplementedError # TODO
# TODO more ndarray-like attributes
# .T
# .flags <--
# .flat
# .imag
# .real
# .base
# ~~~ get/set item/slice connect bigfile blocks to ndarray in RAM.
# only basic indexing is supported - see numpy/.../arrays.indexing.rst
#
# NOTE it would be good if we could reuse prepare_index() &
# npy_index_info from numpy/mapping.[ch]
# access to mapping via property, so that children could hook into it
# (e.g. ZBigArray creates mapping lazily on 1st access)
@property
def _fileh(self):
return self._v_fileh
def __getitem__(self, idx):
# NOTE basic indexing means idx = tuple(slice | int) + sugar(newaxis, ellipsis)
#print('\n__getitem__', idx)
# handle 1d slices uniformly with Nd
if not isinstance(idx, tuple):
idx = (idx,)
idx = list(idx)
# expand ellipsis
try:
ellidx = idx.index(Ellipsis)
except ValueError:
# no ellipsis - nothing to do
pass
else:
# ellipsis present - check there is only 1
            if idx[ellidx+1:].count(Ellipsis):
                raise IndexError('multiple ellipsis not allowed')
            # and expand with `:,:,...` in place of the ellipsis
            # (no need to check nexpand < 0 -- [...]*-1 = [])
            nexpand = len(self.shape) - (len(idx) - idx.count(newaxis) - 1)
            idx[ellidx:ellidx+1] = [slice(None)] * nexpand
#print('...\t->', idx)
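        # e.g. for ndim=3, idx=[0, Ellipsis, 0]: nexpand = 3-(3-0-1) = 1,
        # so idx becomes [0, slice(None), 0]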
# expand idx with : to match shape
        # (no need to check for expanding e.g. -1 times -- [...]*-1 = [])
idx.extend( [slice(None)] * (len(self.shape) - len(idx) - idx.count(newaxis)) )
#print('expand\t->', idx)
# 1) for newaxis - remember we'll need to increase dimensionality
# there after we take view
#
# 2) for scalars - convert `i -> i:i+1` and remember we'll need to reduce
# dimensionality at that position
dim_adjust = [slice(None)] * len(idx) # :,:,:,...
for i in range(len(idx)):
if idx[i] is newaxis:
dim_adjust[i] = newaxis # [newaxis] will increase ndim
elif not isinstance(idx[i], slice):
_ = idx[i]
if _ < 0:
_ = self.shape[i] + _ # -1 -> N-1 (or else -1:-1+1 -> -1:0 = empty)
idx[i] = slice(_, _+1)
dim_adjust[i] = 0 # [0] will reduce ndim
        # NOTE if dim_adjust stayed a list with all elements int, numpy would activate advanced indexing
dim_adjust = tuple(dim_adjust)
#print('scalars\t->', idx)
#print('dim_adj\t->', dim_adjust)
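        # e.g. idx=[newaxis, 2, slice(None)] -> idx=[newaxis, slice(2,3), slice(None)]
        #      with dim_adjust=(newaxis, 0, slice(None))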
# filter-out newaxis from index - so we first work with concrete positions
try:
# XXX not optimal
while 1:
idx.remove(newaxis)
except ValueError:
# no more newaxis - ok
pass
#print('ønewax\t->', idx)
# ensure there are no more indices than we can serve
if len(idx) > len(self.shape):
raise IndexError('too many indices')
# above we cared to expand to shape, if needed
assert len(idx) == len(self.shape)
# now we have:
# - idx and shape are of the same size
# - idx contains only slice objects
# - dim_adjust was prepared for taking scalar and newaxis indices into
# account after we take ndarray view
# major index / stride
# FIXME assumes C ordering
idx0 = idx[0]
stride0 = self._stridev[0]
shape0 = self._shape[0]
# major idx start/stop/stride
idx0_start, idx0_stop, idx0_stride = idx0.indices(shape0)
#print('idx0:\t', idx0, '-> [%s:%s:%s]' % (idx0_start, idx0_stop, idx0_stride))
#print('strid0:\t', stride0) #, self._stridev
#print('shape0:\t', shape0) #, self._shape
# nitems in major row
nitems0 = (idx0_stop - idx0_start - sign(idx0_stride)) // idx0_stride + 1
#print('nitem0:\t', nitems0)
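        # e.g. [4:9:2]  -> (9-4-1)//2 + 1    = 3 items (4,6,8);
        #      [9:4:-2] -> (4-9+1)//(-2) + 1 = 3 items (9,7,5)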
# if major row is "empty" slice, we can return right away without creating vma.
# e.g. 10:5:1, 5:10:-1, 5:5, size+100:size+200 -> []
if nitems0 <= 0:
return ndarray((0,) + self._shape[1:], self._dtype)
# major slice -> in bytes
byte0_start = idx0_start * stride0
byte0_stop = idx0_stop * stride0
byte0_stride = idx0_stride * stride0
# major slice -> in file pages, always increasing, inclusive
page0_min = min(byte0_start, byte0_stop+byte0_stride) // pagesize # TODO -> fileh.pagesize
page0_max = max(byte0_stop-byte0_stride, byte0_start) // pagesize # TODO -> fileh.pagesize
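        # e.g. for a 1d uint8 array, [4*PS+3 : 9*PS-3] (PS = pagesize) gives
        #   page0_min = (4*PS+3)//PS = 4,  page0_max = (9*PS-4)//PS = 8,
        # i.e. pages 4..8 inclusive get mmapped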
# ~~~ mmap file part corresponding to full major slice into memory
vma0 = self._fileh.mmap(page0_min, page0_max-page0_min+1)
# first get ndarray view with only major slice specified and rest indices being ":"
view0_shape = (nitems0,) + self._shape[1:]
view0_offset = byte0_start - page0_min * pagesize # TODO -> fileh.pagesize
view0_stridev = (byte0_stride,) + self._stridev[1:]
#print('view0_shape:\t', view0_shape, self.shape)
#print('view0_offset:\t', view0_offset)
#print('len(vma0):\t', len(vma0))
view0 = ndarray(view0_shape, self._dtype, vma0, view0_offset, view0_stridev)
        # now take into account indices after the major one
view = view0[(slice(None),) + tuple(idx[1:])]
#print('view0:\t', view0.shape)
#print('view:\t', view.shape)
#print('View:\t', view)
#print('view/d:\t', view[dim_adjust])
        # and finally take the dimension adjustments into account and we are done
return view[dim_adjust]
def __setitem__(self, idx, v):
# TODO idx = int, i.e. scalar assign
        # represent the changed area as an ndarray via getitem, then leverage ndarray assignment
a = self.__getitem__(idx)
a[:] = v
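        # e.g. A[2:5] = [6,7,8]: `a = A[2:5]` maps the region into RAM, and
        # `a[:] = [6,7,8]` writes through the mapping, dirtying its pages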
# XXX __array__(self) = self[:] ?
    # (for numpy functions to accept bigarray as-is (if size permits))
../../bigfile/tests/__init__.py
# Wendelin.core.bigarray | Basic tests
# Copyright (C) 2014-2015 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Open Source Initiative approved licenses and Convey
# the resulting work. Corresponding source of such a combination shall include
# the source code for all other software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
from wendelin.bigarray import BigArray
from wendelin.bigfile import BigFile
from wendelin.lib.mem import memcpy
from numpy import ndarray, dtype, int32, uint32, uint8, all, arange, multiply, array_equal
from pytest import raises
# Synthetic bigfile that just loads zeros, and ignores writes (= a-la /dev/zero)
class BigFile_Zero(BigFile):
def loadblk(self, blk, buf):
# Nothing to do here - the memory buf obtained from OS comes pre-cleared
# XXX reenable once/if memory comes uninitialized here
return
def storeblk(self, blk, buf):
return
PS = 2*1024*1024 # FIXME hardcoded, TODO -> ram.pagesize
# basic ndarray-compatibility attributes of BigArray
def test_bigarray_basic():
Z = BigFile_Zero(PS)
Zh = Z.fileh_open()
A = BigArray((10,3), int32, Zh)
raises(TypeError, "A.data")
assert A.strides == (12, 4)
assert A.dtype == dtype(int32)
# XXX .flags?
# XXX .flat? (non-basic)
# XXX .imag? (non-basic)
# XXX .real? (non-basic)
assert A.size == 10*3
assert len(A) == 10
assert A.itemsize == 4
assert A.nbytes == 4*10*3
assert A.ndim == 2
assert A.shape == (10,3)
# XXX .ctypes (non-basic)
# TODO .base
# DoubleGet(obj1, obj2)[key] -> obj1[key], obj2[key]
class DoubleGet:
def __init__(self, obj1, obj2):
self.obj1 = obj1
self.obj2 = obj2
def __getitem__(self, key):
return self.obj1[key], self.obj2[key]
# getitem/setitem (1d case)
def test_bigarray_indexing_1d():
Z = BigFile_Zero(PS)
Zh = Z.fileh_open()
A = BigArray((10*PS,), uint8, Zh)
# ndarray of the same shape - we'll use it to get slices and compare result
# shape/stride against BigArray.__getitem__
A_= ndarray ((10*PS,), uint8)
# AA[key] -> A[key], A_[key]
AA = DoubleGet(A, A_)
# "empty" slices
assert A[10:5:1] .size == 0
assert A[5:10:-1] .size == 0
assert A[5:5] .size == 0
assert A[100*PS:200*PS] .size == 0
# whole array
a, _ = AA[:]
assert isinstance(a, ndarray)
assert a.dtype == dtype(uint8)
assert a.shape == _.shape
assert a.strides == _.strides
assert a[0] == 0
assert a[5*PS] == 0
assert a[10*PS-1] == 0
# overlaps with a
b, _ = AA[4*PS:]
assert isinstance(b, ndarray)
assert b.dtype == dtype(uint8)
assert b.shape == _.shape
assert b.strides == _.strides
assert b[0] == 0
assert b[1*PS] == 0
assert b[5*PS-1] == 0
# a <-> b
assert b[1*PS] == 0
a[5*PS] = 1
assert b[1*PS] == 1
# non-pagesize aligned slice
c, _ = AA[4*PS+3 : 9*PS-3]
assert isinstance(c, ndarray)
assert c.dtype == dtype(uint8)
assert c.shape == _.shape
assert c.strides == _.strides
assert c[0] == 0
assert c[-1] == 0
# a <-> b <-> c
assert b[3] == 0
assert a[4*PS+3] == 0
c[0] = 3
assert b[3] == 3
assert a[4*PS+3] == 3
assert b[5*PS-4] == 0
assert a[9*PS-4] == 0
c[-1] = 99
assert b[5*PS-4] == 99
assert a[9*PS-4] == 99
# negative stride
d, _ = AA[9*PS+1:4*PS-1:-1]
assert isinstance(d, ndarray)
assert d.dtype == dtype(uint8)
assert d.shape == _.shape
assert d.strides == _.strides
assert all(d[:5] == 0)
assert d[5] == 99 # c[-1]
assert all(d[6:-(PS+1)] == 0)
assert d[-(PS+1)] == 1 # a[5*PS]
assert all(d[-PS:-4] == 0)
assert d[-4] == 3 # c[0]
assert all(d[-3:] == 0)
# like c, but stride > 1
e, _ = AA [4*PS+3 : 9*PS-3 : 7]
assert isinstance(e, ndarray)
assert e.dtype == dtype(uint8)
assert e.shape == _.shape
assert e.strides == _.strides
c[0] = 4
assert e[0] == c[0]
c[0] = 5
assert e[0] == c[0]
c[7] = 7
assert e[1] == c[7]
c[7] = 8
assert e[1] == c[7]
# TODO check more
# like d, but stride < -1
f, _ = AA[9*PS+1:4*PS-1:-11]
assert isinstance(f, ndarray)
assert f.dtype == dtype(uint8)
assert f.shape == _.shape
assert f.strides == _.strides
d[0] = 11
assert f[0] == d[0]
d[0] = 12
assert f[0] == d[0]
d[11] = 13
assert f[1] == d[11]
d[11] = 14
assert f[1] == d[11]
# TODO check more
# setitem
A[2*PS+1:3*PS+2] = 5
assert all(a[2*PS+1 : 3*PS+2] == 5)
assert a[2*PS] == 0
assert a[3*PS+3] == 0
A[2*PS+2:2*PS+5] = [6,7,8]
assert a[2*PS+0] == 0
assert a[2*PS+1] == 5
assert a[2*PS+2] == 6
assert a[2*PS+3] == 7
assert a[2*PS+4] == 8
assert a[2*PS+5] == 5
assert a[2*PS+6] == 5
assert raises(ValueError, 'A[:4] = range(5)')
# given dimension length n, yield index variants to test
def indices_to_test(n):
# ":"
yield slice(None)
# int
yield 0
yield -1
yield n//2
# start:stop:stride
yield slice(1,-1)
yield slice(n//4+1, n*3//4-1, 2)
yield slice(n//5+1, n*4//5-1, 3)
# given shape, yield all Nd idx variants, where every index iterates over the full indices_to_test
def idx_to_test(shape, idx_prefix=()):
leaf = len(shape) <= 1
for i in indices_to_test(shape[0]):
idx = idx_prefix + (i,)
if leaf:
yield idx
else:
# = yield from
for _ in idx_to_test(shape[1:], idx):
yield _
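# e.g. idx_to_test((3,4)) yields (slice(None), slice(None)),
# (slice(None), 0), (slice(None), -1), ... - the full cross-product
# indices_to_test(3) x indices_to_test(4)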
# getitem/setitem (Nd case)
def test_bigarray_indexing_Nd():
# shape of tested array - all primes, total size for uint32 ~ 7 2M pages
    # XXX even fewer dimensions (to speed up tests)?
shape = tuple(reversed( (17,23,101,103) ))
# test data - all items are unique - so we can check array by content
# NOTE +PS so that BigFile_Data has no problem loading last blk
# (else data slice will be smaller than buf)
data = arange(multiply.reduce(shape) + PS, dtype=uint32)
# synthetic bigfile that loads data from `data`
class BigFile_Data(BigFile):
def loadblk(self, blk, buf):
datab = data.view(uint8)
x = datab[self.blksize * blk : self.blksize * (blk+1)]
memcpy(buf, x)
def storeblk(self, blk, buf):
raise RuntimeError('tests should not try to change test data')
f = BigFile_Data(PS)
fh = f.fileh_open()
A = BigArray(shape, uint32, fh) # bigarray with test data and shape
A_ = data[:multiply.reduce(shape)].reshape(shape) # ndarray ----//----
# now just go over combinations of various slice at each dimension, and see
# whether slicing result is the same ndarray would do.
for idx in idx_to_test(shape):
a = A [idx]
a_ = A_[idx]
assert array_equal(a, a_)
# TODO ... -> expanded (0,1,2,negative), rejected if many
# TODO newaxis
# TODO nidx < len(shape)
# TODO empty slice in major row, empty slice in secondary row
"""
# ellipsis - take some idx[a:b] and replace it by ...
for ellipsis in range(2): # 0 - no ellipsis
# newaxis - added after at some position(s)
for newaxis in range(3): # 0 - no newaxis
"""
setup.py
@@ -182,9 +182,9 @@ setup(
     packages = ['wendelin'] + ['wendelin.%s' % _ for _ in
                     find_packages(exclude='3rdparty')],
     install_requires = [
-        'numpy',    # lib/mem
+        'numpy',    # BigArray + its children
 
-        # for ZBigFile
+        # for ZBigFile / ZBigArray
         # ( NOTE: ZODB3 3.11 just pulls in latest ZODB _4_, so this way
         #        specifying ZODB _3_ we allow external requirements to
         #        specify either to use e.g. ZODB3.10 or ZODB4 )