From c50503cde8bb8e33424edd0350595d56a363ff0e Mon Sep 17 00:00:00 2001 From: jwittenbach Date: Fri, 6 Jan 2017 12:55:14 -0500 Subject: [PATCH 1/4] removes BoltArrayLocal and reorganizes directory structure --- bolt/__init__.py | 2 +- bolt/base.py | 158 ----- bolt/construct.py | 12 - bolt/factory.py | 83 --- bolt/local/__init__.py | 0 bolt/local/array.py | 255 ------- bolt/local/construct.py | 105 --- bolt/spark/__init__.py | 0 bolt/spark/array.py | 1027 --------------------------- bolt/spark/chunk.py | 677 ------------------ bolt/spark/construct.py | 222 ------ bolt/spark/shapes.py | 167 ----- bolt/spark/stack.py | 154 ---- bolt/spark/statcounter.py | 130 ---- bolt/spark/utils.py | 31 - test/generic.py | 8 +- test/local/test_local_basic.py | 44 -- test/local/test_local_construct.py | 38 - test/local/test_local_functional.py | 52 -- test/spark/test_spark_basic.py | 159 ----- test/spark/test_spark_chunking.py | 208 ------ test/spark/test_spark_construct.py | 96 --- test/spark/test_spark_functional.py | 118 --- test/spark/test_spark_getting.py | 170 ----- test/spark/test_spark_shaping.py | 247 ------- test/spark/test_spark_stacking.py | 133 ---- 26 files changed, 5 insertions(+), 4291 deletions(-) delete mode 100644 bolt/base.py delete mode 100644 bolt/construct.py delete mode 100644 bolt/factory.py delete mode 100644 bolt/local/__init__.py delete mode 100644 bolt/local/array.py delete mode 100644 bolt/local/construct.py delete mode 100644 bolt/spark/__init__.py delete mode 100644 bolt/spark/array.py delete mode 100644 bolt/spark/chunk.py delete mode 100644 bolt/spark/construct.py delete mode 100644 bolt/spark/shapes.py delete mode 100644 bolt/spark/stack.py delete mode 100644 bolt/spark/statcounter.py delete mode 100644 bolt/spark/utils.py delete mode 100644 test/local/test_local_basic.py delete mode 100644 test/local/test_local_construct.py delete mode 100644 test/local/test_local_functional.py delete mode 100644 test/spark/test_spark_basic.py delete mode 100644 test/spark/test_spark_chunking.py delete mode 100644 test/spark/test_spark_construct.py delete mode 100644 test/spark/test_spark_functional.py delete mode 100644 test/spark/test_spark_getting.py delete mode 100644 test/spark/test_spark_shaping.py delete mode 100644 test/spark/test_spark_stacking.py diff --git a/bolt/__init__.py b/bolt/__init__.py index ef75cc6..9892b82 100644 --- a/bolt/__init__.py +++ b/bolt/__init__.py @@ -1,3 +1,3 @@ -from bolt.factory import array, ones, zeros, concatenate +from bolt.array.construct import array, ones, zeros, concatenate __version__ = '0.7.1' diff --git a/bolt/base.py b/bolt/base.py deleted file mode 100644 index 240d926..0000000 --- a/bolt/base.py +++ /dev/null @@ -1,158 +0,0 @@ -class BoltArray(object): - - _mode = None - _metadata = {} - - def __finalize__(self, other): - if isinstance(other, BoltArray): - for name in self._metadata: - other_attr = getattr(other, name, None) - if (other_attr is not self._metadata[name]) \ - and (getattr(self, name, None) is self._metadata[name]): - object.__setattr__(self, name, other_attr) - return self - - @property - def mode(self): - return self._mode - - @property - def shape(self): - """ - Size of each dimension. - """ - raise NotImplementedError - - @property - def size(self): - """ - Total number of elements. - """ - raise NotImplementedError - - @property - def ndim(self): - """ - Number of dimensions. - """ - raise NotImplementedError - - @property - def dtype(self): - """ - Data-type of array. 
- """ - raise NotImplementedError - - @property - def _constructor(self): - return None - - def sum(self, axis): - """ - Return the sum of the array elements over the given axis. - """ - raise NotImplementedError - - def mean(self, axis): - """ - Return the mean of the array elements over the given axis. - """ - raise NotImplementedError - - def var(self, axis): - """ - Return the variance of the array elements over the given axis. - """ - raise NotImplementedError - - def std(self, axis): - """ - Return the standard deviation of the array elements over the given axis. - """ - raise NotImplementedError - - def min(self, axis): - """ - Return the minimum of the array elements over the given axis or axes. - """ - raise NotImplementedError - - def max(self, axis): - """ - Return the maximum of the array elements over the given axis or axes. - """ - raise NotImplementedError - - def concatenate(self, arry, axis): - raise NotImplementedError - - def transpose(self, axis): - """ - Return an array with the axes transposed. - """ - raise NotImplementedError - - @property - def T(self): - """ - Transpose by reversing the order of the axes. - """ - raise NotImplementedError - - def reshape(self, axis): - """ - Return an array with the same data but a new shape. - """ - raise NotImplementedError - - def squeeze(self, axis): - """ - Remove one or more single-dimensional axes from the array. - """ - raise NotImplementedError - - def swapaxes(self, axis1, axis2): - """ - Return an array with two axes interchanged. - """ - raise NotImplementedError - - def astype(self, dtype, casting): - """ - Cast the array to a specified type. - """ - raise NotImplementedError - - def __getitem__(self, index): - raise NotImplementedError - - def map(self, func, axis): - """ - Apply a function across one or more axes. - """ - raise NotImplementedError - - def reduce(self, func, axis, keepdims): - """ - Reduce an array across one or more axes. - """ - raise NotImplementedError - - def filter(self, func, axis): - """ - Filter an array across one or more axes. 
- """ - raise NotImplementedError - - def first(self): - """ - Return the first element of the array - """ - raise NotImplementedError - - def __repr__(self): - s = "BoltArray\n" - s += "mode: %s\n" % self._mode - s += "shape: %s\n" % str(self.shape) - return s diff --git a/bolt/construct.py b/bolt/construct.py deleted file mode 100644 index af865e3..0000000 --- a/bolt/construct.py +++ /dev/null @@ -1,12 +0,0 @@ -class ConstructBase(object): - - @classmethod - def dispatch(cls, method, *args, **kwargs): - if method in cls.__dict__: - return cls.__dict__[method].__func__(*args, **kwargs) - else: - raise NotImplementedError("Method %s not implemented on %s" % (method, cls.__name__)) - - @staticmethod - def _argcheck(*args, **kwargs): - return False diff --git a/bolt/factory.py b/bolt/factory.py deleted file mode 100644 index e8e781b..0000000 --- a/bolt/factory.py +++ /dev/null @@ -1,83 +0,0 @@ -from bolt.local.construct import ConstructLocal -from bolt.spark.construct import ConstructSpark - -constructors = [ - ('local', ConstructLocal), - ('spark', ConstructSpark) -] - -def wrapped(f): - """ - Decorator to append routed docstrings - """ - import inspect - - def extract(func): - append = "" - args = inspect.getargspec(func) - for i, a in enumerate(args.args): - if i < (len(args) - len(args.defaults)): - append += str(a) + ", " - else: - default = args.defaults[i-len(args.defaults)] - if hasattr(default, "__name__"): - default = default.__name__ - else: - default = str(default) - append += str(a) + "=" + default + ", " - append = append[:-2] + ")" - return append - - doc = f.__doc__ + "\n" - doc += " local -> array(" + extract(getattr(ConstructLocal, f.__name__)) + "\n" - doc += " spark -> array(" + extract(getattr(ConstructSpark, f.__name__)) + "\n" - f.__doc__ = doc - return f - -def lookup(*args, **kwargs): - """ - Use arguments to route constructor. - - Applies a series of checks on arguments to identify constructor, - starting with known keyword arguments, and then applying - constructor-specific checks - """ - if 'mode' in kwargs: - mode = kwargs['mode'] - if mode not in constructors: - raise ValueError('Mode %s not supported' % mode) - del kwargs['mode'] - return constructors[mode] - else: - for mode, constructor in constructors: - if constructor._argcheck(*args, **kwargs): - return constructor - return ConstructLocal - -@wrapped -def array(*args, **kwargs): - """ - Create a bolt array. - """ - return lookup(*args, **kwargs).dispatch('array', *args, **kwargs) - -@wrapped -def ones(*args, **kwargs): - """ - Create a bolt array of ones. - """ - return lookup(*args, **kwargs).dispatch('ones', *args, **kwargs) - -@wrapped -def zeros(*args, **kwargs): - """ - Create a bolt array of zeros. - """ - return lookup(*args, **kwargs).dispatch('zeros', *args, **kwargs) - -@wrapped -def concatenate(*args, **kwargs): - """ - Create a bolt array of ones. 
- """ - return lookup(*args, **kwargs).dispatch('concatenate', *args, **kwargs) \ No newline at end of file diff --git a/bolt/local/__init__.py b/bolt/local/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bolt/local/array.py b/bolt/local/array.py deleted file mode 100644 index daf0eaf..0000000 --- a/bolt/local/array.py +++ /dev/null @@ -1,255 +0,0 @@ -from __future__ import print_function -from numpy import ndarray, asarray, ufunc, prod -from bolt.base import BoltArray -from bolt.utils import inshape, tupleize -from functools import reduce - - -class BoltArrayLocal(ndarray, BoltArray): - - def __new__(cls, array): - obj = asarray(array).view(cls) - obj._mode = 'local' - return obj - - def __array_finalize__(self, obj): - if obj is None: - return - self._mode = getattr(obj, 'mode', None) - - def __array_wrap__(self, obj): - if obj.shape == (): - return obj[()] - else: - return ndarray.__array_wrap__(self, obj) - - @property - def _constructor(self): - return BoltArrayLocal - - def _align(self, axes, key_shape=None): - """ - Align local bolt array so that axes for iteration are in the keys. - - This operation is applied before most functional operators. - It ensures that the specified axes are valid, and might transpose/reshape - the underlying array so that the functional operators can be applied - over the correct records. - - Parameters - ---------- - axes: tuple[int] - One or more axes that will be iterated over by a functional operator - - Returns - ------- - BoltArrayLocal - """ - - # ensure that the key axes are valid for an ndarray of this shape - inshape(self.shape, axes) - - # compute the set of dimensions/axes that will be used to reshape - remaining = [dim for dim in range(len(self.shape)) if dim not in axes] - key_shape = key_shape if key_shape else [self.shape[axis] for axis in axes] - remaining_shape = [self.shape[axis] for axis in remaining] - linearized_shape = [prod(key_shape)] + remaining_shape - - # compute the transpose permutation - transpose_order = axes + remaining - - # transpose the array so that the keys being mapped over come first, then linearize keys - reshaped = self.transpose(*transpose_order).reshape(*linearized_shape) - - return reshaped - - def filter(self, func, axis=(0,)): - """ - Filter array along an axis. - - Applies a function which should evaluate to boolean, - along a single axis or multiple axes. Array will be - aligned so that the desired set of axes are in the - keys, which may require a transpose/reshape. - - Parameters - ---------- - func : function - Function to apply, should return boolean - - axis : tuple or int, optional, default=(0,) - Axis or multiple axes to filter along. - - Returns - ------- - BoltArrayLocal - """ - axes = sorted(tupleize(axis)) - reshaped = self._align(axes) - - filtered = asarray(list(filter(func, reshaped))) - - return self._constructor(filtered) - - def map(self, func, axis=(0,)): - """ - Apply a function across an axis. - - Array will be aligned so that the desired set of axes - are in the keys, which may require a transpose/reshape. - - Parameters - ---------- - func : function - Function of a single array to apply - - axis : tuple or int, optional, default=(0,) - Axis or multiple axes to apply function along. 
- - Returns - ------- - BoltArrayLocal - """ - axes = sorted(tupleize(axis)) - key_shape = [self.shape[axis] for axis in axes] - reshaped = self._align(axes, key_shape=key_shape) - - mapped = asarray(list(map(func, reshaped))) - elem_shape = mapped[0].shape - - # invert the previous reshape operation, using the shape of the map result - linearized_shape_inv = key_shape + list(elem_shape) - reordered = mapped.reshape(*linearized_shape_inv) - - return self._constructor(reordered) - - def reduce(self, func, axis=0): - """ - Reduce an array along an axis. - - Applies an associative/commutative function of two arguments - cumulatively to all arrays along an axis. Array will be aligned - so that the desired set of axes are in the keys, which may - require a transpose/reshape. - - Parameters - ---------- - func : function - Function of two arrays that returns a single array - - axis : tuple or int, optional, default=(0,) - Axis or multiple axes to reduce along. - - Returns - ------- - BoltArrayLocal - """ - axes = sorted(tupleize(axis)) - - # if the function is a ufunc, it can automatically handle reducing over multiple axes - if isinstance(func, ufunc): - inshape(self.shape, axes) - reduced = func.reduce(self, axis=tuple(axes)) - else: - reshaped = self._align(axes) - reduced = reduce(func, reshaped) - - new_array = self._constructor(reduced) - - # ensure that the shape of the reduced array is valid - expected_shape = [self.shape[i] for i in range(len(self.shape)) if i not in axes] - if new_array.shape != tuple(expected_shape): - raise ValueError("reduce did not yield a BoltArray with valid dimensions") - - return new_array - - def first(self): - """ - Return first element of the array - """ - return self[0] - - def concatenate(self, arry, axis=0): - """ - Join this array with another array. - - Paramters - --------- - arry : ndarray or BoltArrayLocal - Another array to concatenate with - - axis : int, optional, default=0 - The axis along which arrays will be joined. - - Returns - ------- - BoltArrayLocal - """ - if isinstance(arry, ndarray): - from bolt import concatenate - return concatenate((self, arry), axis) - else: - raise ValueError("other must be local array, got %s" % type(arry)) - - def toscalar(self): - """ - Returns the single scalar element contained in an array of shape (), if - the array has that shape. Returns self otherwise. 
- """ - if self.shape == (): - return self.toarray().reshape(1)[0] - else: - return self - - def tospark(self, sc, axis=0): - """ - Converts a BoltArrayLocal into a BoltArraySpark - - Parameters - ---------- - sc : SparkContext - The SparkContext which will be used to create the BoltArraySpark - - axis : tuple or int, optional, default=0 - The axis (or axes) across which this array will be parallelized - - Returns - ------- - BoltArraySpark - """ - from bolt import array - return array(self.toarray(), sc, axis=axis) - - def tordd(self, sc, axis=0): - """ - Converts a BoltArrayLocal into an RDD - - Parameters - ---------- - sc : SparkContext - The SparkContext which will be used to create the BoltArraySpark - - axis : tuple or int, optional, default=0 - The axis (or axes) across which this array will be parallelized - - Returns - ------- - RDD[(tuple, ndarray)] - """ - from bolt import array - return array(self.toarray(), sc, axis=axis).tordd() - - def toarray(self): - """ - Returns the underlying ndarray wrapped by this BoltArrayLocal - """ - return asarray(self) - - def display(self): - """ - Show a pretty-printed representation of this BoltArrayLocal - """ - print(str(self)) - - def __repr__(self): - return BoltArray.__repr__(self) diff --git a/bolt/local/construct.py b/bolt/local/construct.py deleted file mode 100644 index 41ae035..0000000 --- a/bolt/local/construct.py +++ /dev/null @@ -1,105 +0,0 @@ -from numpy import float64, asarray - -from bolt.construct import ConstructBase -from bolt.local.array import BoltArrayLocal - - -class ConstructLocal(ConstructBase): - - @staticmethod - def array(a, dtype=None, order='C'): - """ - Create a local bolt array. - - Parameters - ---------- - a : array-like - An array, any object exposing the array interface, an - object whose __array__ method returns an array, or any - (nested) sequence. - - dtype : data-type, optional, default=None - The desired data-type for the array. If None, will - be determined from the data. (see numpy) - - order : {'C', 'F', 'A'}, optional, default='C' - The order of the array. (see numpy) - - Returns - ------- - BoltArrayLocal - """ - return BoltArrayLocal(asarray(a, dtype, order)) - - @staticmethod - def ones(shape, dtype=float64, order='C'): - """ - Create a local bolt array of ones. - - Parameters - ---------- - shape : tuple - Dimensions of the desired array - - dtype : data-type, optional, default=float64 - The desired data-type for the array. (see numpy) - - order : {'C', 'F', 'A'}, optional, default='C' - The order of the array. (see numpy) - - Returns - ------- - BoltArrayLocal - """ - from numpy import ones - return ConstructLocal._wrap(ones, shape, dtype, order) - - @staticmethod - def zeros(shape, dtype=float64, order='C'): - """ - Create a local bolt array of zeros. - - Parameters - ---------- - shape : tuple - Dimensions of the desired array. - - dtype : data-type, optional, default=float64 - The desired data-type for the array. (see numpy) - - order : {'C', 'F', 'A'}, optional, default='C' - The order of the array. (see numpy) - - Returns - ------- - BoltArrayLocal - """ - from numpy import zeros - return ConstructLocal._wrap(zeros, shape, dtype, order) - - @staticmethod - def _wrap(func, shape, dtype, order): - return BoltArrayLocal(func(shape, dtype, order)) - - @staticmethod - def concatenate(arrays, axis=0): - """ - Join a sequence of arrays together. - - Parameters - ---------- - arrays : tuple - A sequence of array-like e.g. (a1, a2, ...) 
- - axis : int, optional, default=0 - The axis along which the arrays will be joined. - - Returns - ------- - BoltArrayLocal - """ - if not isinstance(arrays, tuple): - raise ValueError("data type not understood") - arrays = tuple([asarray(a) for a in arrays]) - from numpy import concatenate - return BoltArrayLocal(concatenate(arrays, axis)) \ No newline at end of file diff --git a/bolt/spark/__init__.py b/bolt/spark/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bolt/spark/array.py b/bolt/spark/array.py deleted file mode 100644 index 0bc3cbb..0000000 --- a/bolt/spark/array.py +++ /dev/null @@ -1,1027 +0,0 @@ -from __future__ import print_function -from numpy import asarray, unravel_index, prod, mod, ndarray, ceil, where, \ - r_, sort, argsort, array, random, arange, ones, expand_dims, sum -from itertools import groupby - -from bolt.base import BoltArray -from bolt.spark.stack import StackedArray -from bolt.spark.utils import zip_with_index -from bolt.spark.statcounter import StatCounter -from bolt.utils import slicify, listify, tupleize, argpack, inshape, istransposeable, isreshapeable - - -class BoltArraySpark(BoltArray): - - _metadata = { - '_shape': None, - '_split': None, - '_dtype': None, - '_ordered': True - } - - def __init__(self, rdd, shape=None, split=None, dtype=None, ordered=True): - self._rdd = rdd - self._shape = shape - self._split = split - self._dtype = dtype - self._mode = 'spark' - self._ordered = ordered - - @property - def _constructor(self): - return BoltArraySpark - - def __array__(self): - return self.toarray() - - def cache(self): - """ - Cache the underlying RDD in memory. - """ - self._rdd.cache() - - def unpersist(self): - """ - Remove the underlying RDD from memory. - """ - self._rdd.unpersist() - - def repartition(self, npartitions): - """ - Repartitions the underlying RDD - - Parameters - ---------- - npartitions : int - Number of partitions to repartion the underlying RDD to - """ - - rdd = self._rdd.repartition(npartitions) - return self._constructor(rdd, ordered=False).__finalize__(self) - - def stack(self, size=None): - """ - Aggregates records of a distributed array. - - Stacking should improve the performance of vectorized operations, - but the resulting StackedArray object only exposes a restricted set - of operations (e.g. map, reduce). The unstack method can be used - to restore the full bolt array. - - Parameters - ---------- - size : int, optional, default=None - The maximum size for each stack (number of original records), - will aggregate groups of records per partition up to this size, - if None will aggregate all records on each partition. - - Returns - ------- - StackedArray - """ - stk = StackedArray(self._rdd, shape=self.shape, split=self.split) - return stk.stack(size) - - def _align(self, axis): - """ - Align spark bolt array so that axes for iteration are in the keys. - - This operation is applied before most functional operators. - It ensures that the specified axes are valid, and swaps - key/value axes so that functional operators can be applied - over the correct records. 
- - Parameters - ---------- - axis: tuple[int] - One or more axes that wil be iterated over by a functional operator - - Returns - ------- - BoltArraySpark - """ - # ensure that the specified axes are valid - inshape(self.shape, axis) - - # find the value axes that should be moved into the keys (axis >= split) - tokeys = [(a - self.split) for a in axis if a >= self.split] - - # find the key axes that should be moved into the values (axis < split) - tovalues = [a for a in range(self.split) if a not in axis] - - if tokeys or tovalues: - return self.swap(tovalues, tokeys) - else: - return self - - def first(self): - """ - Return the first element of an array - """ - from bolt.local.array import BoltArrayLocal - rdd = self._rdd if self._ordered else self._rdd.sortByKey() - return BoltArrayLocal(rdd.values().first()) - - def map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False): - """ - Apply a function across an axis. - - Array will be aligned so that the desired set of axes - are in the keys, which may incur a swap. - - Parameters - ---------- - func : function - Function of a single array to apply. If with_keys=True, - function should be of a (tuple, array) pair. - - axis : tuple or int, optional, default=(0,) - Axis or multiple axes to apply function along. - - value_shape : tuple, optional, default=None - Known shape of values resulting from operation - - dtype: numpy.dtype, optional, default=None - Known dtype of values resulting from operation - - with_keys : bool, optional, default=False - Include keys as an argument to the function - - Returns - ------- - BoltArraySpark - """ - axis = tupleize(axis) - swapped = self._align(axis) - - if with_keys: - test_func = lambda x: func(((0,), x)) - else: - test_func = func - - if value_shape is None or dtype is None: - # try to compute the size of each mapped element by applying func to a random array - try: - mapped = test_func(random.randn(*swapped.values.shape).astype(self.dtype)) - except Exception: - first = swapped._rdd.first() - if first: - # eval func on the first element - mapped = test_func(first[1]) - if value_shape is None: - value_shape = mapped.shape - if dtype is None: - dtype = mapped.dtype - - shape = tuple([swapped._shape[ax] for ax in range(len(axis))]) + tupleize(value_shape) - - if with_keys: - rdd = swapped._rdd.map(lambda kv: (kv[0], func(kv))) - else: - rdd = swapped._rdd.mapValues(func) - - # reshaping will fail if the elements aren't uniformly shaped - def check(v): - if len(v.shape) > 0 and v.shape != tupleize(value_shape): - raise Exception("Map operation did not produce values of uniform shape.") - return v - - rdd = rdd.mapValues(lambda v: check(v)) - - return self._constructor(rdd, shape=shape, dtype=dtype, split=swapped.split).__finalize__(swapped) - - def filter(self, func, axis=(0,), sort=False): - """ - Filter array along an axis. - - Applies a function which should evaluate to boolean, - along a single axis or multiple axes. Array will be - aligned so that the desired set of axes are in the keys, - which may incur a swap. - - Parameters - ---------- - func : function - Function to apply, should return boolean - - axis : tuple or int, optional, default=(0,) - Axis or multiple axes to filter along. 
- - sort: bool, optional, default=False - Whether or not to sort by key before reindexing - - Returns - ------- - BoltArraySpark - """ - axis = tupleize(axis) - - swapped = self._align(axis) - def f(record): - return func(record[1]) - rdd = swapped._rdd.filter(f) - if sort: - rdd = rdd.sortByKey().values() - else: - rdd = rdd.values() - - # count the resulting array in order to reindex (linearize) the keys - count, zipped = zip_with_index(rdd) - if not count: - count = zipped.count() - reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0])) - - # since we can only filter over one axis, the remaining shape is always the following - remaining = list(swapped.shape[len(axis):]) - if count != 0: - shape = tuple([count] + remaining) - else: - shape = (0,) - - return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped) - - def reduce(self, func, axis=(0,), keepdims=False): - """ - Reduce an array along an axis. - - Applies a commutative/associative function of two - arguments cumulatively to all arrays along an axis. - Array will be aligned so that the desired set of axes - are in the keys, which may incur a swap. - - Parameters - ---------- - func : function - Function of two arrays that returns a single array - - axis : tuple or int, optional, default=(0,) - Axis or multiple axes to reduce along. - - Returns - ------- - BoltArraySpark - """ - from bolt.local.array import BoltArrayLocal - from numpy import ndarray - - axis = tupleize(axis) - swapped = self._align(axis) - arr = swapped._rdd.values().treeReduce(func, depth=3) - - if keepdims: - for i in axis: - arr = expand_dims(arr, axis=i) - - if not isinstance(arr, ndarray): - # the result of a reduce can also be a scalar - return arr - elif arr.shape == (1,): - # ndarrays with single values in them should be converted into scalars - return arr[0] - - return BoltArrayLocal(arr) - - def _stat(self, axis=None, func=None, name=None, keepdims=False): - """ - Compute a statistic over an axis. - - Can provide either a function (for use in a reduce) - or a name (for use by a stat counter). - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - func : function, optional, default=None - Function for reduce, see BoltArraySpark.reduce - - name : str - A named statistic, see StatCounter - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. - """ - if axis is None: - axis = list(range(len(self.shape))) - axis = tupleize(axis) - - if func and not name: - return self.reduce(func, axis, keepdims) - - if name and not func: - from bolt.local.array import BoltArrayLocal - - swapped = self._align(axis) - - def reducer(left, right): - return left.combine(right) - - counter = swapped._rdd.values()\ - .mapPartitions(lambda i: [StatCounter(values=i, stats=name)])\ - .treeReduce(reducer, depth=3) - - arr = getattr(counter, name) - - if keepdims: - for i in axis: - arr = expand_dims(arr, axis=i) - - return BoltArrayLocal(arr).toscalar() - - else: - raise ValueError('Must specify either a function or a statistic name.') - - def mean(self, axis=None, keepdims=False): - """ - Return the mean of the array over the given axis. - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. 
- """ - return self._stat(axis, name='mean', keepdims=keepdims) - - def var(self, axis=None, keepdims=False): - """ - Return the variance of the array over the given axis. - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. - """ - return self._stat(axis, name='variance', keepdims=keepdims) - - def std(self, axis=None, keepdims=False): - """ - Return the standard deviation of the array over the given axis. - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. - """ - return self._stat(axis, name='stdev', keepdims=keepdims) - - def sum(self, axis=None, keepdims=False): - """ - Return the sum of the array over the given axis. - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. - """ - from operator import add - return self._stat(axis, func=add, keepdims=keepdims) - - def max(self, axis=None, keepdims=False): - """ - Return the maximum of the array over the given axis. - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. - """ - from numpy import maximum - return self._stat(axis, func=maximum, keepdims=keepdims) - - def min(self, axis=None, keepdims=False): - """ - Return the minimum of the array over the given axis. - - Parameters - ---------- - axis : tuple or int, optional, default=None - Axis to compute statistic over, if None - will compute over all axes - - keepdims : boolean, optional, default=False - Keep axis remaining after operation with size 1. - """ - from numpy import minimum - return self._stat(axis, func=minimum, keepdims=keepdims) - - def concatenate(self, arry, axis=0): - """ - Join this array with another array. - - Paramters - --------- - arry : ndarray, BoltArrayLocal, or BoltArraySpark - Another array to concatenate with - - axis : int, optional, default=0 - The axis along which arrays will be joined. 
- - Returns - ------- - BoltArraySpark - """ - if isinstance(arry, ndarray): - from bolt.spark.construct import ConstructSpark - arry = ConstructSpark.array(arry, self._rdd.context, axis=range(0, self.split)) - else: - if not isinstance(arry, BoltArraySpark): - raise ValueError("other must be local array or spark array, got %s" % type(arry)) - - if not all([x == y if not i == axis else True - for i, (x, y) in enumerate(zip(self.shape, arry.shape))]): - raise ValueError("all the input array dimensions except for " - "the concatenation axis must match exactly") - - if not self.split == arry.split: - raise NotImplementedError("two arrays must have the same split ") - - if axis < self.split: - shape = self.keys.shape - - def key_func(key): - key = list(key) - key[axis] += shape[axis] - return tuple(key) - - rdd = self._rdd.union(arry._rdd.map(lambda kv: (key_func(kv[0]), kv[1]))) - - else: - from numpy import concatenate as npconcatenate - shift = axis - self.split - rdd = self._rdd.join(arry._rdd).map(lambda kv: (kv[0], npconcatenate(kv[1], axis=shift))) - - shape = tuple([x + y if i == axis else x - for i, (x, y) in enumerate(zip(self.shape, arry.shape))]) - - return self._constructor(rdd, shape=shape, ordered=False).__finalize__(self) - - def _getbasic(self, index): - """ - Basic indexing (for slices or ints). - """ - key_slices = index[0:self.split] - value_slices = index[self.split:] - - def key_check(key): - def inrange(k, s): - if s.step > 0: - return s.start <= k < s.stop - else: - return s.stop < k <= s.start - def check(k, s): - return inrange(k, s) and mod(k - s.start, s.step) == 0 - out = [check(k, s) for k, s in zip(key, key_slices)] - return all(out) - - def key_func(key): - return tuple([(k - s.start)/s.step for k, s in zip(key, key_slices)]) - - filtered = self._rdd.filter(lambda kv: key_check(kv[0])) - - if self._split == self.ndim: - rdd = filtered.map(lambda kv: (key_func(kv[0]), kv[1])) - else: - # handle use of use slice.stop = -1 for a special case (see utils.slicify) - value_slices = [s if s.stop != -1 else slice(s.start, None, s.step) for s in value_slices] - rdd = filtered.map(lambda kv: (key_func(kv[0]), kv[1][value_slices])) - - shape = tuple([int(ceil((s.stop - s.start) / float(s.step))) for s in index]) - split = self.split - return rdd, shape, split - - def _getadvanced(self, index): - """ - Advanced indexing (for sets, lists, or ndarrays). 
- """ - index = [asarray(i) for i in index] - shape = index[0].shape - if not all([i.shape == shape for i in index]): - raise ValueError("shape mismatch: indexing arrays could not be broadcast " - "together with shapes " + ("%s " * self.ndim) - % tuple([i.shape for i in index])) - - index = tuple([listify(i, d) for (i, d) in zip(index, self.shape)]) - - # build tuples with target indices - key_tuples = list(zip(*index[0:self.split])) - value_tuples = list(zip(*index[self.split:])) - - # build dictionary to look up targets in values - d = {} - for k, g in groupby(zip(value_tuples, key_tuples), lambda x: x[1]): - d[k] = map(lambda x: x[0], list(g)) - - def key_check(key): - return key in key_tuples - - def key_func(key): - return unravel_index(key, shape) - - # filter records based on key targets - filtered = self._rdd.filter(lambda kv: key_check(kv[0])) - - # subselect and flatten records based on value targets (if they exist) - if len(value_tuples) > 0: - flattened = filtered.flatMap(lambda kv: [(kv[0], kv[1][i]) for i in d[kv[0]]]) - else: - flattened = filtered - - # reindex - indexed = flattened.zipWithIndex() - rdd = indexed.map(lambda kkv: (key_func(kkv[1]), kkv[0][1])) - split = len(shape) - - return rdd, shape, split - - def _getmixed(self, index): - """ - Mixed indexing (combines basic and advanced indexes) - - Assumes that only a single advanced index is used, due to the complicated - behavior needed to be compatible with NumPy otherwise. - """ - # find the single advanced index - loc = where([isinstance(i, (tuple, list, ndarray)) for i in index])[0][0] - idx = list(index[loc]) - - if isinstance(idx[0], (tuple, list, ndarray)): - raise ValueError("When mixing basic and advanced indexing, " - "advanced index must be one-dimensional") - - # single advanced index is on a key -- filter and update key - if loc < self.split: - def newkey(key): - newkey = list(key) - newkey[loc] = idx.index(key[loc]) - return tuple(newkey) - rdd = self._rdd.filter(lambda kv: kv[0][loc] in idx).map(lambda kv: (newkey(kv[0]), kv[1])) - # single advanced index is on a value -- use NumPy indexing - else: - slices = [slice(0, None, None) for _ in self.values.shape] - slices[loc - self.split] = idx - rdd = self._rdd.map(lambda kv: (kv[0], kv[1][slices])) - newshape = list(self.shape) - newshape[loc] = len(idx) - barray = self._constructor(rdd, shape=tuple(newshape)).__finalize__(self) - - # apply the rest of the simple indices - new_index = index[:] - new_index[loc] = slice(0, None, None) - barray = barray[tuple(new_index)] - return barray._rdd, barray.shape, barray.split - - def __getitem__(self, index): - """ - Get an item from the array through indexing. - - Supports basic indexing with slices and ints, or advanced - indexing with lists or ndarrays of integers. - Mixing basic and advanced indexing across axes is currently supported - only for a single advanced index amidst multiple basic indices. 
- - Parameters - ---------- - index : tuple of slices, ints, list, tuple, or ndarrays - One or more index specifications - - Returns - ------- - BoltSparkArray - """ - if isinstance(index, tuple): - index = list(index) - else: - index = [index] - int_locs = where([isinstance(i, int) for i in index])[0] - - if len(index) > self.ndim: - raise ValueError("Too many indices for array") - - if not all([isinstance(i, (slice, int, list, tuple, ndarray)) for i in index]): - raise ValueError("Each index must either be a slice, int, list, set, or ndarray") - - # fill unspecified axes with full slices - if len(index) < self.ndim: - index += tuple([slice(0, None, None) for _ in range(self.ndim - len(index))]) - - # standardize slices and bounds checking - for n, idx in enumerate(index): - size = self.shape[n] - if isinstance(idx, (slice, int)): - slc = slicify(idx, size) - # throw an error if this would lead to an empty dimension in numpy - if slc.step > 0: - minval, maxval = slc.start, slc.stop - else: - minval, maxval = slc.stop, slc.start - if minval > size-1 or maxval < 1 or minval >= maxval: - raise ValueError("Index {} in dimension {} with shape {} would " - "produce an empty dimension".format(idx, n, size)) - index[n] = slc - else: - adjusted = array(idx) - inds = where(adjusted<0) - adjusted[inds] += size - if adjusted.min() < 0 or adjusted.max() > size-1: - raise ValueError("Index {} out of bounds in dimension {} with " - "shape {}".format(idx, n, size)) - index[n] = adjusted - - # select basic or advanced indexing - if all([isinstance(i, slice) for i in index]): - rdd, shape, split = self._getbasic(index) - elif all([isinstance(i, (tuple, list, ndarray)) for i in index]): - rdd, shape, split = self._getadvanced(index) - elif sum([isinstance(i, (tuple, list, ndarray)) for i in index]) == 1: - rdd, shape, split = self._getmixed(index) - else: - raise NotImplementedError("When mixing basic indexing (slices and int) with " - "with advanced indexing (lists, tuples, and ndarrays), " - "can only have a single advanced index") - - # if any key indices used negative steps, records are no longer ordered - if self._ordered is False or any([isinstance(s, slice) and s.step<0 for s in index[:self.split]]): - ordered = False - else: - ordered = True - - result = self._constructor(rdd, shape=shape, split=split, ordered=ordered).__finalize__(self) - - # squeeze out int dimensions (and squeeze to singletons if all ints) - if len(int_locs) == self.ndim: - return result.squeeze().toarray()[()] - else: - return result.squeeze(tuple(int_locs)) - - def chunk(self, size="150", axis=None, padding=None): - """ - Chunks records of a distributed array. - - Chunking breaks arrays into subarrays, using an specified - size of chunks along each value dimension. Can alternatively - specify an average chunk byte size (in kilobytes) and the size of - chunks (as ints) will be computed automatically. - - Parameters - ---------- - size : tuple, int, or str, optional, default = "150" - A string giving the size in kilobytes, or a tuple with the size - of chunks along each dimension. - - axis : int or tuple, optional, default = None - One or more axis to chunk array along, if None - will use all axes, - - padding: tuple or int, default = None - Number of elements per dimension that will overlap with the adjacent chunk. - If a tuple, specifies padding along each chunked dimension; if a int, same - padding will be applied to all chunked dimensions. 
- - Returns - ------- - ChunkedArray - """ - if type(size) is not str: - size = tupleize((size)) - axis = tupleize((axis)) - padding = tupleize((padding)) - - from bolt.spark.chunk import ChunkedArray - - chnk = ChunkedArray(rdd=self._rdd, shape=self._shape, split=self._split, dtype=self._dtype) - return chnk._chunk(size, axis, padding) - - def swap(self, kaxes, vaxes, size="150"): - """ - Swap axes from keys to values. - - This is the core operation underlying shape manipulation - on the Spark bolt array. It exchanges an arbitrary set of axes - between the keys and the valeus. If either is None, will only - move axes in one direction (from keys to values, or values to keys). - Keys moved to values will be placed immediately after the split; - values moved to keys will be placed immediately before the split. - - Parameters - ---------- - kaxes : tuple - Axes from keys to move to values - - vaxes : tuple - Axes from values to move to keys - - size : tuple or int, optional, default = "150" - Can either provide a string giving the size in kilobytes, - or a tuple with the number of chunks along each - value dimension being moved - - Returns - ------- - BoltArraySpark - """ - kaxes = asarray(tupleize(kaxes), 'int') - vaxes = asarray(tupleize(vaxes), 'int') - if type(size) is not str: - size = tupleize(size) - - if len(kaxes) == self.keys.ndim and len(vaxes) == 0: - raise ValueError('Cannot perform a swap that would ' - 'end up with all data on a single key') - - if len(kaxes) == 0 and len(vaxes) == 0: - return self - - from bolt.spark.chunk import ChunkedArray - - chunks = self.chunk(size) - - swapped = chunks.keys_to_values(kaxes).values_to_keys([v+len(kaxes) for v in vaxes]) - barray = swapped.unchunk() - - return barray - - def transpose(self, *axes): - """ - Return an array with the axes transposed. - - This operation will incur a swap unless the - desiured permutation can be obtained - only by transpoing the keys or the values. - - Parameters - ---------- - axes : None, tuple of ints, or n ints - If None, will reverse axis order. - """ - if len(axes) == 0: - p = arange(self.ndim-1, -1, -1) - else: - p = asarray(argpack(axes)) - - istransposeable(p, range(self.ndim)) - - split = self.split - - # compute the keys/value axes that need to be swapped - new_keys, new_values = p[:split], p[split:] - swapping_keys = sort(new_values[new_values < split]) - swapping_values = sort(new_keys[new_keys >= split]) - stationary_keys = sort(new_keys[new_keys < split]) - stationary_values = sort(new_values[new_values >= split]) - - # compute the permutation that the swap causes - p_swap = r_[stationary_keys, swapping_values, swapping_keys, stationary_values] - - # compute the extra permutation (p_x) on top of this that - # needs to happen to get the full permutation desired - p_swap_inv = argsort(p_swap) - p_x = p_swap_inv[p] - p_keys, p_values = p_x[:split], p_x[split:]-split - - # perform the swap and the the within key/value permutations - arr = self.swap(swapping_keys, swapping_values-split) - arr = arr.keys.transpose(tuple(p_keys.tolist())) - arr = arr.values.transpose(tuple(p_values.tolist())) - - return arr - - @property - def T(self): - """ - Transpose by reversing the order of the axes. - """ - return self.transpose() - - def swapaxes(self, axis1, axis2): - """ - Return the array with two axes interchanged. 
- - Parameters - ---------- - axis1 : int - The first axis to swap - - axis2 : int - The second axis to swap - """ - p = list(range(self.ndim)) - p[axis1] = axis2 - p[axis2] = axis1 - - return self.transpose(p) - - def reshape(self, *shape): - """ - Return an array with the same data but a new shape. - - Currently only supports reshaping that independently - reshapes the keys, or the values, or both. - - Parameters - ---------- - shape : tuple of ints, or n ints - New shape - """ - new = argpack(shape) - isreshapeable(new, self.shape) - - if new == self.shape: - return self - - i = self._reshapebasic(new) - if i == -1: - raise NotImplementedError("Currently no support for reshaping between " - "keys and values for BoltArraySpark") - else: - new_key_shape, new_value_shape = new[:i], new[i:] - return self.keys.reshape(new_key_shape).values.reshape(new_value_shape) - - def _reshapebasic(self, shape): - """ - Check if the requested reshape can be broken into independant reshapes - on the keys and values. If it can, returns the index in the new shape - separating keys from values, otherwise returns -1 - """ - new = tupleize(shape) - old_key_size = prod(self.keys.shape) - old_value_size = prod(self.values.shape) - - for i in range(len(new)): - new_key_size = prod(new[:i]) - new_value_size = prod(new[i:]) - if new_key_size == old_key_size and new_value_size == old_value_size: - return i - - return -1 - - def squeeze(self, axis=None): - """ - Remove one or more single-dimensional axes from the array. - - Parameters - ---------- - axis : tuple or int - One or more singleton axes to remove. - """ - if not any([d == 1 for d in self.shape]): - return self - - if axis is None: - drop = where(asarray(self.shape) == 1)[0] - elif isinstance(axis, int): - drop = asarray((axis,)) - elif isinstance(axis, tuple): - drop = asarray(axis) - else: - raise ValueError("an integer or tuple is required for the axis") - - if any([self.shape[i] > 1 for i in drop]): - raise ValueError("cannot select an axis to squeeze out which has size greater than one") - - if any(asarray(drop) < self.split): - kmask = set([d for d in drop if d < self.split]) - kfunc = lambda k: tuple([kk for ii, kk in enumerate(k) if ii not in kmask]) - else: - kfunc = lambda k: k - - if any(asarray(drop) >= self.split): - vmask = tuple([d - self.split for d in drop if d >= self.split]) - vfunc = lambda v: v.squeeze(vmask) - else: - vfunc = lambda v: v - - rdd = self._rdd.map(lambda kv: (kfunc(kv[0]), vfunc(kv[1]))) - shape = tuple([ss for ii, ss in enumerate(self.shape) if ii not in drop]) - split = len([d for d in range(self.keys.ndim) if d not in drop]) - return self._constructor(rdd, shape=shape, split=split).__finalize__(self) - - def astype(self, dtype, casting='unsafe'): - """ - Cast the array to a specified type. - - Parameters - ---------- - dtype : str or dtype - Typecode or data-type to cast the array to (see numpy) - """ - rdd = self._rdd.mapValues(lambda v: v.astype(dtype, 'K', casting)) - return self._constructor(rdd, dtype=dtype).__finalize__(self) - - def clip(self, min=None, max=None): - """ - Clip values above and below. - - Parameters - ---------- - min : scalar or array-like - Minimum value. If array, will be broadcasted - - max : scalar or array-like - Maximum value. If array, will be broadcasted. - """ - rdd = self._rdd.mapValues(lambda v: v.clip(min=min, max=max)) - return self._constructor(rdd).__finalize__(self) - - @property - def shape(self): - """ - Size of each dimension. 
- """ - return self._shape - - @property - def size(self): - """ - Total number of elements. - """ - return prod(self._shape) - - @property - def ndim(self): - """ - Number of dimensions. - """ - return len(self._shape) - - @property - def split(self): - """ - Axis at which the array is split into keys/values. - """ - return self._split - - @property - def dtype(self): - """ - Data-type of array. - """ - return self._dtype - - @property - def mask(self): - return tuple([1] * len(self.keys.shape) + [0] * len(self.values.shape)) - - @property - def keys(self): - """ - Returns a restricted keys. - """ - from bolt.spark.shapes import Keys - return Keys(self) - - @property - def values(self): - from bolt.spark.shapes import Values - return Values(self) - - def tolocal(self): - """ - Returns a local bolt array by first collecting as an array. - """ - from bolt.local.array import BoltArrayLocal - return BoltArrayLocal(self.toarray()) - - def toarray(self): - """ - Returns the contents as a local array. - - Will likely cause memory problems for large objects. - """ - rdd = self._rdd if self._ordered else self._rdd.sortByKey() - x = rdd.values().collect() - return asarray(x).reshape(self.shape) - - def tordd(self): - """ - Return the underlying RDD of the bolt array. - """ - return self._rdd - - def display(self): - """ - Show a pretty-printed representation of this BoltArrayLocal. - """ - for x in self._rdd.take(10): - print(x) diff --git a/bolt/spark/chunk.py b/bolt/spark/chunk.py deleted file mode 100644 index 25ec393..0000000 --- a/bolt/spark/chunk.py +++ /dev/null @@ -1,677 +0,0 @@ -from numpy import zeros, ones, asarray, r_, concatenate, arange, ceil, prod, \ - empty, mod, floor, any, ndarray, amin, amax, array_equal, squeeze, array, \ - where, random, ravel_multi_index - -from itertools import product - -from bolt.utils import tuplesort, tupleize, allstack, iterexpand -from bolt.spark.array import BoltArraySpark - - -class ChunkedArray(object): - """ - Wraps a BoltArraySpark and provides an interface for chunking - into subarrays and performing operations on chunks. Many methods will - be restricted until the chunked array is unchunked. - - The general form supports axis movement during chunking, specifically, - moving axes from keys to values and vice versa. For every - value-dimension that becomes a key, the values are sliced along that - dimension into 'chunks' of a user-specified size. This is an - intermediate form that can be transformed back into a BoltSparkArray. 
- """ - _metadata = ['_shape', '_split', '_dtype', '_plan', '_padding', '_ordered'] - - def __init__(self, rdd, shape=None, split=None, dtype=None, plan=None, padding=None, ordered=None): - self._rdd = rdd - self._shape = shape - self._split = split - self._dtype = dtype - self._plan = plan - self._padding = padding - self._ordered = ordered - - @property - def dtype(self): - return self._dtype - - @property - def shape(self): - return self._shape - - @property - def split(self): - return self._split - - @property - def plan(self): - return self._plan - - @property - def padding(self): - return self._padding - - @property - def uniform(self): - return all([mod(x, y) == 0 for x, y in zip(self.vshape, self.plan)]) - - @property - def padded(self): - return not all([p == 0 for p in self.padding]) - - @property - def kshape(self): - return asarray(self._shape[:self._split]) - - @property - def vshape(self): - return asarray(self._shape[self._split:]) - - def kmask(self, axes): - return self.getmask(axes, len(self.kshape)) - - def vmask(self, axes): - return self.getmask(axes, len(self.vshape)) - - @property - def _constructor(self): - return ChunkedArray - - def __finalize__(self, other): - for name in self._metadata: - other_attr = getattr(other, name, None) - if (other_attr is not None) and (getattr(self, name, None) is None): - object.__setattr__(self, name, other_attr) - return self - - def _chunk(self, size="150", axis=None, padding=None): - """ - Split values of distributed array into chunks. - - Transforms an underlying pair RDD of (key, value) into - records of the form: (key, chunk id), (chunked value). - Here, chunk id is a tuple identifying the chunk and - chunked value is a subset of the data from each original value, - that has been divided along the specified dimensions. - - Parameters - ---------- - size : str or tuple or int - If str, the average size (in KB) of the chunks in all value dimensions. - If int or tuple, an explicit specification of the number chunks in - each value dimension. - - axis : tuple, optional, default=None - One or more axes to estimate chunks for, if provided any - other axes will use one chunk. - - padding: tuple or int, default = None - Number of elements per dimension that will overlap with the adjacent chunk. - If a tuple, specifies padding along each chunked dimension; if a int, same - padding will be applied to all chunked dimensions. 
- """ - if self.split == len(self.shape) and padding is None: - self._rdd = self._rdd.map(lambda kv: (kv[0]+(0,), array(kv[1], ndmin=1))) - self._shape = self._shape + (1,) - self._plan = (1,) - self._padding = array([0]) - return self - - rdd = self._rdd - self._plan, self._padding = self.getplan(size, axis, padding) - - if any([x + y > z for x, y, z in zip(self.plan, self.padding, self.vshape)]): - raise ValueError("Chunk sizes %s plus padding sizes %s cannot exceed value dimensions %s along any axis" - % (tuple(self.plan), tuple(self.padding), tuple(self.vshape))) - - if any([x > y for x, y in zip(self.padding, self.plan)]): - raise ValueError("Padding sizes %s cannot exceed chunk sizes %s along any axis" - % (tuple(self.padding), tuple(self.plan))) - - slices = self.getslices(self.plan, self.padding, self.vshape) - labels = list(product(*[list(enumerate(s)) for s in slices])) - scheme = [list(zip(*s)) for s in labels] - - def _chunk(record): - k, v = record[0], record[1] - for (chk, slc) in scheme: - if type(k) is int: - k = (k,) - yield k + chk, v[slc] - - rdd = rdd.flatMap(_chunk) - return self._constructor(rdd, shape=self.shape, split=self.split, - dtype=self.dtype, plan=self.plan, padding=self.padding, ordered=self._ordered) - - def unchunk(self): - """ - Convert a chunked array back into a full array with (key,value) pairs - where key is a tuple of indices, and value is an ndarray. - """ - plan, padding, vshape, split = self.plan, self.padding, self.vshape, self.split - nchunks = self.getnumber(plan, vshape) - full_shape = concatenate((nchunks, plan)) - n = len(vshape) - perm = concatenate(list(zip(range(n), range(n, 2*n)))) - - if self.uniform: - def _unchunk(it): - ordered = sorted(it, key=lambda kv: kv[0][split:]) - keys, values = zip(*ordered) - yield keys[0][:split], asarray(values).reshape(full_shape).transpose(perm).reshape(vshape) - else: - def _unchunk(it): - ordered = sorted(it, key=lambda kv: kv[0][split:]) - keys, values = zip(*ordered) - k_chks = [k[split:] for k in keys] - arr = empty(nchunks, dtype='object') - for (i, d) in zip(k_chks, values): - arr[i] = d - yield keys[0][:split], allstack(arr.tolist()) - - # remove padding - if self.padded: - removepad = self.removepad - rdd = self._rdd.map(lambda kv: (kv[0], removepad(kv[0][split:], kv[1], nchunks, padding, axes=range(n)))) - else: - rdd = self._rdd - - # skip partitionBy if there is not actually any chunking - if array_equal(self.plan, self.vshape): - rdd = rdd.map(lambda kv: (kv[0][:split], kv[1])) - ordered = self._ordered - else: - ranges = self.kshape - npartitions = int(prod(ranges)) - if len(self.kshape) == 0: - partitioner = lambda k: 0 - else: - partitioner = lambda k: ravel_multi_index(k[:split], ranges) - rdd = rdd.partitionBy(numPartitions=npartitions, partitionFunc=partitioner).mapPartitions(_unchunk) - ordered = True - - if array_equal(self.vshape, [1]): - rdd = rdd.mapValues(lambda v: squeeze(v)) - newshape = self.shape[:-1] - else: - newshape = self.shape - - return BoltArraySpark(rdd, shape=newshape, split=self._split, - dtype=self.dtype, ordered=ordered) - - def keys_to_values(self, axes, size=None): - """ - Move indices in the keys into the values. - - Padding on these new value-dimensions is not currently supported and is set to 0. - - Parameters - ---------- - axes : tuple - Axes from keys to move to values. - - size : tuple, optional, default=None - Size of chunks for the values along the new dimensions. 
- If None, then no chunking for all axes (number of chunks = 1) - - Returns - ------- - ChunkedArray - """ - if len(axes) == 0: - return self - - kmask = self.kmask(axes) - - if size is None: - size = self.kshape[kmask] - - # update properties - newplan = r_[size, self.plan] - newsplit = self._split - len(axes) - newshape = tuple(r_[self.kshape[~kmask], self.kshape[kmask], self.vshape].astype(int).tolist()) - newpadding = r_[zeros(len(axes), dtype=int), self.padding] - - result = self._constructor(None, shape=newshape, split=newsplit, - dtype=self.dtype, plan=newplan, padding=newpadding, ordered=True) - - # convert keys into chunk + within-chunk label - split = self.split - def _relabel(record): - k, data = record - keys, chks = asarray(k[:split], 'int'), k[split:] - movingkeys, stationarykeys = keys[kmask], keys[~kmask] - newchks = [int(m) for m in movingkeys/size] # element-wise integer division that works in Python 2 and 3 - labels = mod(movingkeys, size) - return tuple(stationarykeys) + tuple(newchks) + tuple(chks) + tuple(labels), data - - rdd = self._rdd.map(_relabel) - - # group the new chunks together - nchunks = result.getnumber(result.plan, result.vshape) - npartitions = int(prod(result.kshape) * prod(nchunks)) - ranges = tuple(result.kshape) + tuple(nchunks) - n = len(axes) - if n == 0: - s = slice(None) - else: - s = slice(-n) - partitioner = lambda k: ravel_multi_index(k[s], ranges) - - rdd = rdd.partitionBy(numPartitions=npartitions, partitionFunc=partitioner) - - # reassemble the pieces in the chunks by sorting and then stacking - uniform = result.uniform - - def _rebuild(it): - ordered = sorted(it, key=lambda kv: kv[0][n:]) - keys, data = zip(*ordered) - - k = keys[0][s] - labels = asarray([x[-n:] for x in keys]) - - if uniform: - labelshape = tuple(size) - else: - labelshape = tuple(amax(labels, axis=0) - amin(labels, axis=0) + 1) - - valshape = data[0].shape - fullshape = labelshape + valshape - yield k, asarray(data).reshape(fullshape) - - result._rdd = rdd.mapPartitions(_rebuild) - - if array_equal(self.vshape, [1]): - result._rdd = result._rdd.mapValues(lambda v: squeeze(v)) - result._shape = result.shape[:-1] - result._plan = result.plan[:-1] - - return result - - def values_to_keys(self, axes): - - vmask = self.vmask(axes) - split = self.split - - # update properties - newplan = self.plan[~vmask] - newsplit = split + len(axes) - newshape = tuple(r_[self.kshape, self.vshape[vmask], self.vshape[~vmask]].astype(int).tolist()) - newpadding = self.padding[~vmask] - - result = self._constructor(None, shape=newshape, split=newsplit, - dtype=self.dtype, plan=newplan, padding=newpadding, ordered=self._ordered) - - # remove padding - if self.padded: - plan, padding = self.plan, self.padding - nchunks = self.getnumber(plan, self.vshape) - removepad = self.removepad - rdd = self._rdd.map(lambda kv: (kv[0], removepad(kv[0][split:], kv[1], nchunks, padding, axes=axes))) - else: - rdd = self._rdd - - # extract new records - slices = [None if vmask[i] else slice(0, self.vshape[i], 1) for i in range(len(vmask))] - slices = asarray(slices) - - movingsizes = self.plan[vmask] - split = self.split - def _extract(record): - - keys, data = record - k, chk = keys[:split], keys[split:] - - movingchks = asarray(chk)[vmask] - newchks = tuple(asarray(chk)[~vmask]) - keyoffsets = prod([movingchks, movingsizes], axis=0) - - bounds = asarray(data.shape)[vmask] - indices = list(product(*map(lambda x: arange(x), bounds))) - - for b in indices: - s = slices.copy() - s[vmask] = b - newdata = 
data[tuple(s)] - newkeys = tuple(r_[k, keyoffsets + b].astype('int')) - yield newkeys + newchks, newdata - - result._rdd = rdd.flatMap(_extract) - - if len(result.vshape) == 0: - result._rdd = result._rdd.mapValues(lambda v: array(v, ndmin=1)) - result._shape = result._shape + (1,) - result._plan = (1,) - result._padding = array([0]) - - return result - - def map(self, func, value_shape=None, dtype=None): - """ - Apply an array -> array function on each subarray. - - The function can change the shape of the subarray, but only along - dimensions that are not chunked. - - Parameters - ---------- - func : function - Function of a single subarray to apply - - value_shape: - Known shape of chunking plan after the map - - dtype: numpy.dtype, optional, default=None - Known dtype of values resulting from operation - - Returns - ------- - ChunkedArray - """ - - if value_shape is None or dtype is None: - # try to compute the size of each mapped element by applying func to a random array - try: - mapped = func(random.randn(*self.plan).astype(self.dtype)) - except Exception: - first = self._rdd.first() - if first: - # eval func on the first element - mapped = func(first[1]) - if value_shape is None: - value_shape = mapped.shape - if dtype is None: - dtype = mapped.dtype - - chunked_dims = where(self.plan != self.vshape)[0] - unchunked_dims = where(self.plan == self.vshape)[0] - - # check that no dimensions are dropped - if len(value_shape) != len(self.plan): - raise NotImplementedError('map on ChunkedArray cannot drop dimensions') - - # check that chunked dimensions did not change shape - if any([value_shape[i] != self.plan[i] for i in chunked_dims]): - raise ValueError('map cannot change the sizes of chunked dimensions') - - def check_and_apply(v): - new = func(v) - if len(unchunked_dims) > 0: - if any([new.shape[i] != value_shape[i] for i in unchunked_dims]): - raise Exception("Map operation did not produce values of uniform shape.") - if len(chunked_dims) > 0: - if any([v.shape[i] != new.shape[i] for i in chunked_dims]): - raise Exception("Map operation changed the size of a chunked dimension") - return new - - rdd = self._rdd.mapValues(check_and_apply) - - vshape = [value_shape[i] if i in unchunked_dims else self.vshape[i] for i in range(len(self.vshape))] - newshape = r_[self.kshape, vshape].astype(int).tolist() - - return self._constructor(rdd, shape=tuple(newshape), dtype=dtype, - plan=asarray(value_shape)).__finalize__(self) - - def map_generic(self, func): - """ - Apply a generic array -> object to each subarray - - The resulting object is a BoltArraySpark of dtype object where the - blocked dimensions are replaced with indices indication block ID. - """ - def process_record(val): - newval = empty(1, dtype="object") - newval[0] = func(val) - return newval - - rdd = self._rdd.mapValues(process_record) - - nchunks = self.getnumber(self.plan, self.vshape) - newshape = tuple([int(s) for s in r_[self.kshape, nchunks]]) - newsplit = len(self.shape) - return BoltArraySpark(rdd, shape=newshape, split=newsplit, ordered=self._ordered, dtype="object") - - def getplan(self, size="150", axes=None, padding=None): - """ - Identify a plan for chunking values along each dimension. - - Generates an ndarray with the size (in number of elements) of chunks - in each dimension. If provided, will estimate chunks for only a - subset of axes, leaving all others to the full size of the axis. - - Parameters - ---------- - size : string or tuple - If str, the average size (in KB) of the chunks in all value dimensions. 
- If int/tuple, an explicit specification of the number chunks in - each moving value dimension. - - axes : tuple, optional, default=None - One or more axes to estimate chunks for, if provided any - other axes will use one chunk. - - padding : tuple or int, option, default=None - Size over overlapping padding between chunks in each dimension. - If tuple, specifies padding along each chunked dimension; if int, - all dimensions use same padding; if None, no padding - """ - from numpy import dtype as gettype - - # initialize with all elements in one chunk - plan = self.vshape - - # check for subset of axes - if axes is None: - if isinstance(size, str): - axes = arange(len(self.vshape)) - else: - axes = arange(len(size)) - else: - axes = asarray(axes, 'int') - - # set padding - pad = array(len(self.vshape)*[0, ]) - if padding is not None: - pad[axes] = padding - - # set the plan - if isinstance(size, tuple): - plan[axes] = size - - elif isinstance(size, str): - # convert from kilobytes - size = 1000.0 * float(size) - - # calculate from dtype - elsize = gettype(self.dtype).itemsize - nelements = prod(self.vshape) - dims = self.vshape[self.vmask(axes)] - - if size <= elsize: - s = ones(len(axes)) - - else: - remsize = 1.0 * nelements * elsize - s = [] - for (i, d) in enumerate(dims): - minsize = remsize/d - if minsize >= size: - s.append(1) - remsize = minsize - continue - else: - s.append(min(d, floor(size/minsize))) - s[i+1:] = plan[i+1:] - break - - plan[axes] = s - - else: - raise ValueError("Chunk size not understood, must be tuple or int") - - return plan, pad - - @staticmethod - def removepad(idx, value, number, padding, axes=None): - """ - Remove the padding from chunks. - - Given a chunk and its corresponding index, use the plan and padding to remove any - padding from the chunk along with specified axes. - - Parameters - ---------- - idx: tuple or array-like - The chunk index, indicating which chunk this is. - - value: ndarray - The chunk that goes along with the index. - - number: ndarray or array-like - The number of chunks along each dimension. - - padding: ndarray or array-like - The padding scheme. - - axes: tuple, optional, default = None - The axes (in the values) along which to remove padding. - """ - if axes is None: - axes = range(len(number)) - mask = len(number)*[False, ] - for i in range(len(mask)): - if i in axes and padding[i] != 0: - mask[i] = True - - starts = [0 if (i == 0 or not m) else p for (i, m, p) in zip(idx, mask, padding)] - stops = [None if (i == n-1 or not m) else -p for (i, m, p, n) in zip(idx, mask, padding, number)] - slices = [slice(i1, i2) for (i1, i2) in zip(starts, stops)] - - return value[slices] - - @staticmethod - def getnumber(plan, shape): - """ - Obtain number of chunks for the given dimensions and chunk sizes. - - Given a plan for the number of chunks along each dimension, - calculate the number of chunks that this will lead to. - - Parameters - ---------- - plan: tuple or array-like - Size of chunks (in number of elements) along each dimensions. - Length must be equal to the number of dimensions. - - shape : tuple - Shape of array to be chunked. - """ - nchunks = [] - for size, d in zip(plan, shape): - nchunks.append(int(ceil(1.0 * d/size))) - return nchunks - - @staticmethod - def getslices(plan, padding, shape): - """ - Obtain slices for the given dimensions, padding, and chunks. - - Given a plan for the number of chunks along each dimension and the amount of padding, - calculate a list of slices required to generate those chunks. 
- - Parameters - ---------- - plan: tuple or array-like - Size of chunks (in number of elements) along each dimensions. - Length must be equal to the number of dimensions. - - padding: tuple or array-like - Size of overlap (in number of elements) between chunks along each dimension. - Length must be equal to the number of dimensions. - - shape: tuple - Dimensions of axes to be chunked. - """ - slices = [] - for size, pad, d in zip(plan, padding, shape): - nchunks = int(floor(d/size)) - remainder = d % size - start = 0 - dimslices = [] - for idx in range(nchunks): - end = start + size - # left endpoint - if idx == 0: - left = start - else: - left = start - pad - # right endpoint - if idx == nchunks: - right = end - else: - right = end + pad - dimslices.append(slice(left, right, 1)) - start = end - if remainder: - dimslices.append(slice(end - pad, d, 1)) - slices.append(dimslices) - return slices - - @staticmethod - def getmask(inds, n): - """ - Obtain a binary mask by setting a subset of entries to true. - - Parameters - ---------- - inds : array-like - Which indices to set as true. - - n : int - The length of the target mask. - """ - inds = asarray(inds, 'int') - mask = zeros(n, dtype=bool) - mask[inds] = True - return mask - - def tordd(self): - """ - Return the RDD wrapped by the ChunkedArray. - - Returns - ------- - RDD - """ - return self._rdd - - def cache(self): - """ - Cache the underlying RDD in memory. - """ - self._rdd.cache() - - def unpersist(self): - """ - Remove the underlying RDD from memory. - """ - self._rdd.unpersist() - - def __str__(self): - s = "Chunked BoltArray\n" - s += "shape: %s\n" % str(self.shape) - return s - - def __repr__(self): - string = str(self) - if array_equal(self.vshape, [1]): - newlines = [i for (i, char) in enumerate(string) if char=='\n'] - string = string[:newlines[-2]+1] - string += "shape: %s\n" % str(self.shape[:-1]) - string += "chunk size: %s\n" % str(tuple(self.plan)) - if self.padded: - string += "padding: %s\n" % str(tuple(self.padding)) - else: - string += "padding: none\n" - - return string diff --git a/bolt/spark/construct.py b/bolt/spark/construct.py deleted file mode 100644 index 6083e6e..0000000 --- a/bolt/spark/construct.py +++ /dev/null @@ -1,222 +0,0 @@ -from numpy import unravel_index, prod, arange, asarray, float64 - -from itertools import product - -from bolt.construct import ConstructBase -from bolt.spark.array import BoltArraySpark -from bolt.spark.utils import get_kv_shape, get_kv_axes - - -class ConstructSpark(ConstructBase): - - @staticmethod - def array(a, context=None, axis=(0,), dtype=None, npartitions=None): - """ - Create a spark bolt array from a local array. - - Parameters - ---------- - a : array-like - An array, any object exposing the array interface, an - object whose __array__ method returns an array, or any - (nested) sequence. - - context : SparkContext - A context running Spark. (see pyspark) - - axis : tuple, optional, default=(0,) - Which axes to distribute the array along. The resulting - distributed object will use keys to represent these axes, - with the remaining axes represented by values. - - dtype : data-type, optional, default=None - The desired data-type for the array. If None, will - be determined from the data. (see numpy) - - npartitions : int - Number of partitions for parallization. 
- - Returns - ------- - BoltArraySpark - """ - if dtype is None: - arry = asarray(a) - dtype = arry.dtype - else: - arry = asarray(a, dtype) - shape = arry.shape - ndim = len(shape) - - # handle the axes specification and transpose if necessary - axes = ConstructSpark._format_axes(axis, arry.shape) - key_axes, value_axes = get_kv_axes(arry.shape, axes) - permutation = key_axes + value_axes - arry = arry.transpose(*permutation) - split = len(axes) - - if split < 1: - raise ValueError("split axis must be greater than 0, got %g" % split) - if split > len(shape): - raise ValueError("split axis must not exceed number of axes %g, got %g" % (ndim, split)) - - key_shape = shape[:split] - val_shape = shape[split:] - - keys = zip(*unravel_index(arange(0, int(prod(key_shape))), key_shape)) - vals = arry.reshape((prod(key_shape),) + val_shape) - - rdd = context.parallelize(zip(keys, vals), npartitions) - return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype) - - @staticmethod - def ones(shape, context=None, axis=(0,), dtype=float64, npartitions=None): - """ - Create a spark bolt array of ones. - - Parameters - ---------- - shape : tuple - The desired shape of the array. - - context : SparkContext - A context running Spark. (see pyspark) - - axis : tuple, optional, default=(0,) - Which axes to distribute the array along. The resulting - distributed object will use keys to represent these axes, - with the remaining axes represented by values. - - dtype : data-type, optional, default=float64 - The desired data-type for the array. If None, will - be determined from the data. (see numpy) - - npartitions : int - Number of partitions for parallization. - - Returns - ------- - BoltArraySpark - """ - from numpy import ones - return ConstructSpark._wrap(ones, shape, context, axis, dtype, npartitions) - - @staticmethod - def zeros(shape, context=None, axis=(0,), dtype=float64, npartitions=None): - """ - Create a spark bolt array of zeros. - - Parameters - ---------- - shape : tuple - The desired shape of the array. - - context : SparkContext - A context running Spark. (see pyspark) - - axis : tuple, optional, default=(0,) - Which axes to distribute the array along. The resulting - distributed object will use keys to represent these axes, - with the remaining axes represented by values. - - dtype : data-type, optional, default=float64 - The desired data-type for the array. If None, will - be determined from the data. (see numpy) - - npartitions : int - Number of partitions for parallization. - - Returns - ------- - BoltArraySpark - """ - from numpy import zeros - return ConstructSpark._wrap(zeros, shape, context, axis, dtype, npartitions) - - @staticmethod - def concatenate(arrays, axis=0): - """ - Join two bolt arrays together, at least one of which is in spark. - - Parameters - ---------- - arrays : tuple - A pair of arrays. At least one must be a spark array, - the other can be a local bolt array, a local numpy array, - or an array-like. - - axis : int, optional, default=0 - The axis along which the arrays will be joined. 
- - Returns - ------- - BoltArraySpark - """ - if not isinstance(arrays, tuple): - raise ValueError("data type not understood") - if not len(arrays) == 2: - raise NotImplementedError("spark concatenation only supports two arrays") - - first, second = arrays - if isinstance(first, BoltArraySpark): - return first.concatenate(second, axis) - elif isinstance(second, BoltArraySpark): - first = ConstructSpark.array(first, second._rdd.context) - return first.concatenate(second, axis) - else: - raise ValueError("at least one array must be a spark bolt array") - - @staticmethod - def _argcheck(*args, **kwargs): - """ - Check that arguments are consistent with spark array construction. - - Conditions are: - (1) a positional argument is a SparkContext - (2) keyword arg 'context' is a SparkContext - (3) an argument is a BoltArraySpark, or - (4) an argument is a nested list containing a BoltArraySpark - """ - try: - from pyspark import SparkContext - except ImportError: - return False - - cond1 = any([isinstance(arg, SparkContext) for arg in args]) - cond2 = isinstance(kwargs.get('context', None), SparkContext) - cond3 = any([isinstance(arg, BoltArraySpark) for arg in args]) - cond4 = any([any([isinstance(sub, BoltArraySpark) for sub in arg]) - if isinstance(arg, (tuple, list)) else False for arg in args]) - return cond1 or cond2 or cond3 or cond4 - - @staticmethod - def _format_axes(axes, shape): - """ - Format target axes given an array shape - """ - if isinstance(axes, int): - axes = (axes,) - elif isinstance(axes, list) or hasattr(axes, '__iter__'): - axes = tuple(axes) - if not isinstance(axes, tuple): - raise ValueError("axes argument %s in the constructor not specified correctly" % str(axes)) - if min(axes) < 0 or max(axes) > len(shape) - 1: - raise ValueError("invalid key axes %s given shape %s" % (str(axes), str(shape))) - return axes - - @staticmethod - def _wrap(func, shape, context=None, axis=(0,), dtype=None, npartitions=None): - """ - Wrap an existing numpy constructor in a parallelized construction - """ - if isinstance(shape, int): - shape = (shape,) - key_shape, value_shape = get_kv_shape(shape, ConstructSpark._format_axes(axis, shape)) - split = len(key_shape) - - # make the keys - rdd = context.parallelize(list(product(*[arange(x) for x in key_shape])), npartitions) - - # use a map to make the arrays in parallel - rdd = rdd.map(lambda x: (x, func(value_shape, dtype, order='C'))) - return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype) diff --git a/bolt/spark/shapes.py b/bolt/spark/shapes.py deleted file mode 100644 index 0624a8d..0000000 --- a/bolt/spark/shapes.py +++ /dev/null @@ -1,167 +0,0 @@ -from numpy import unravel_index, ravel_multi_index - -from bolt.utils import argpack, istransposeable, isreshapeable -from bolt.spark.array import BoltArraySpark - - -class Shapes(object): - """ - Base Shape class. These classes wrap a BoltArraySpark in their - entirity, but implement the following attributes and methods as if - they were only working on the keys or the values, depending which - subclass is used. - """ - @property - def shape(self): - raise NotImplementedError - - @property - def ndim(self): - return len(self.shape) - - def reshape(self): - raise NotImplementedError - - def transpose(self): - raise NotImplementedError - -class Keys(Shapes): - """ - This class implements all the base shape attributes and methods - for the keys of a BoltArraySpark. 
- """ - def __init__(self, barray): - self._barray = barray - - @property - def shape(self): - return self._barray.shape[:self._barray.split] - - def reshape(self, *shape): - """ - Reshape just the keys of a BoltArraySpark, returning a - new BoltArraySpark. - - Parameters - ---------- - shape : tuple - New proposed axes. - """ - new = argpack(shape) - old = self.shape - isreshapeable(new, old) - - if new == old: - return self._barray - - def f(k): - return unravel_index(ravel_multi_index(k, old), new) - - newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1])) - newsplit = len(new) - newshape = new + self._barray.values.shape - - return BoltArraySpark(newrdd, shape=newshape, split=newsplit).__finalize__(self._barray) - - def transpose(self, *axes): - """ - Transpose just the keys of a BoltArraySpark, returning a - new BoltArraySpark. - - Parameters - ---------- - axes : tuple - New proposed axes. - """ - new = argpack(axes) - old = range(self.ndim) - istransposeable(new, old) - - if new == old: - return self._barray - - def f(k): - return tuple(k[i] for i in new) - - newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1])) - newshape = tuple(self.shape[i] for i in new) + self._barray.values.shape - - return BoltArraySpark(newrdd, shape=newshape, ordered=False).__finalize__(self._barray) - - def __str__(self): - s = "BoltArray Keys\n" - s += "shape: %s" % str(self.shape) - return s - - def __repr__(self): - return str(self) - -class Values(Shapes): - """ - This class implements all the base shape attributes and methods - for the values of a BoltArraySpark. - """ - def __init__(self, barray): - self._barray = barray - - @property - def shape(self): - return self._barray.shape[self._barray.split:] - - def reshape(self, *shape): - """ - Reshape just the values of a BoltArraySpark, returning a - new BoltArraySpark. - - Parameters - ---------- - shape : tuple - New proposed axes. - """ - new = argpack(shape) - old = self.shape - isreshapeable(new, old) - - if new == old: - return self._barray - - def f(v): - return v.reshape(new) - - newrdd = self._barray._rdd.mapValues(f) - newshape = self._barray.keys.shape + new - - return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray) - - def transpose(self, *axes): - """ - Transpose just the values of a BoltArraySpark, returning a - new BoltArraySpark. - - Parameters - ---------- - axes : tuple - New proposed axes. - """ - new = argpack(axes) - old = range(self.ndim) - istransposeable(new, old) - - if new == old: - return self._barray - - def f(v): - return v.transpose(new) - - newrdd = self._barray._rdd.mapValues(f) - newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new) - - return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray) - - def __str__(self): - s = "BoltArray Values\n" - s += "shape: %s" % str(self.shape) - return s - - def __repr__(self): - return str(self) diff --git a/bolt/spark/stack.py b/bolt/spark/stack.py deleted file mode 100644 index cf5ddb8..0000000 --- a/bolt/spark/stack.py +++ /dev/null @@ -1,154 +0,0 @@ -from numpy import asarray, ndarray, concatenate -from bolt.spark.utils import zip_with_index - -class StackedArray(object): - """ - Wraps a BoltArraySpark and provides an interface for performing - stacked operations (operations on aggregated subarrays). Many methods - will be restricted or forbidden until the Stacked object is - unstacked. Currently, only map() is implemented. 
The rationale - is that many operations will work faster when vectorized over a - slightly larger array. - - The implementation uses an intermediate RDD that collects all - records on a given partition into 'stacked' (key, value) records. - Here, a key is a 'size' long tuple of original record keys, - and and values is a an array of the corresponding values, - concatenated along a new 0th dimenion. - """ - _metadata = ['_rdd', '_shape', '_split', '_rekeyed'] - - def __init__(self, rdd, shape=None, split=None, rekeyed=False): - self._rdd = rdd - self._shape = shape - self._split = split - self._rekeyed = rekeyed - - def __finalize__(self, other): - for name in self._metadata: - other_attr = getattr(other, name, None) - if (other_attr is not None) and (getattr(self, name, None) is None): - object.__setattr__(self, name, other_attr) - return self - - @property - def shape(self): - return self._shape - - @property - def split(self): - return self._split - - @property - def rekey(self): - return self._rekeyed - - @property - def _constructor(self): - return StackedArray - - def stack(self, size): - """ - Make an intermediate RDD where all records are combined into a - list of keys and larger ndarray along a new 0th dimension. - """ - def tostacks(partition): - keys = [] - arrs = [] - for key, arr in partition: - keys.append(key) - arrs.append(arr) - if size and 0 <= size <= len(keys): - yield (keys, asarray(arrs)) - keys, arrs = [], [] - if keys: - yield (keys, asarray(arrs)) - - rdd = self._rdd.mapPartitions(tostacks) - return self._constructor(rdd).__finalize__(self) - - def unstack(self): - """ - Unstack array and return a new BoltArraySpark via flatMap(). - """ - from bolt.spark.array import BoltArraySpark - - if self._rekeyed: - rdd = self._rdd - else: - rdd = self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1]))) - - return BoltArraySpark(rdd, shape=self.shape, split=self.split) - - def map(self, func): - """ - Apply a function on each subarray. - - Parameters - ---------- - func : function - This is applied to each value in the intermediate RDD. - - Returns - ------- - StackedArray - """ - vshape = self.shape[self.split:] - x = self._rdd.values().first() - if x.shape == vshape: - a, b = asarray([x]), asarray([x, x]) - else: - a, b = x, concatenate((x, x)) - - try: - atest = func(a) - btest = func(b) - except Exception as e: - raise RuntimeError("Error evaluating function on test array, got error:\n %s" % e) - - if not (isinstance(atest, ndarray) and isinstance(btest, ndarray)): - raise ValueError("Function must return ndarray") - - # different shapes map to the same new shape - elif atest.shape == btest.shape: - if self._rekeyed is True: - # we've already rekeyed - rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1]))) - shape = (self.shape[0],) + atest.shape - else: - # do the rekeying - count, rdd = zip_with_index(self._rdd.values()) - rdd = rdd.map(lambda kv: ((kv[1],), func(kv[0]))) - shape = (count,) + atest.shape - split = 1 - rekeyed = True - - # different shapes stay different (along the first dimension) - elif atest.shape[0] == a.shape[0] and btest.shape[0] == b.shape[0]: - shape = self.shape[0:self.split] + atest.shape[1:] - split = self.split - rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1]))) - rekeyed = self._rekeyed - - else: - raise ValueError("Cannot infer effect of function on shape") - - return self._constructor(rdd, rekeyed=rekeyed, shape=shape, split=split).__finalize__(self) - - def tordd(self): - """ - Return the RDD wrapped by the StackedArray. 
- - Returns - ------- - RDD - """ - return self._rdd - - def __str__(self): - s = "Stacked BoltArray\n" - s += "shape: %s\n" % str(self.shape) - return s - - def __repr__(self): - return str(self) diff --git a/bolt/spark/statcounter.py b/bolt/spark/statcounter.py deleted file mode 100644 index 162f6eb..0000000 --- a/bolt/spark/statcounter.py +++ /dev/null @@ -1,130 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This file is ported from spark/util/StatCounter.scala -# -# This code is based on pyspark's statcounter.py and used under the ASF 2.0 license. - -import copy -from itertools import chain - -from numpy import sqrt - - -class StatCounter(object): - - REQUIRED_FOR = { - 'mean': ('mu',), - 'sum': ('mu',), - 'variance': ('mu', 'm2'), - 'stdev': ('mu', 'm2'), - 'all': ('mu', 'm2') - } - - def __init__(self, values=(), stats='all'): - self.n = 0 - self.mu = 0.0 - self.m2 = 0.0 - - if isinstance(stats, str): - stats = [stats] - self.required = frozenset(chain().from_iterable([StatCounter.REQUIRED_FOR[stat] for stat in stats])) - - for v in values: - self.merge(v) - - # add a value into this StatCounter, updating the statistics - def merge(self, value): - self.n += 1 - if self.__requires('mu'): - delta = value - self.mu - self.mu += delta / self.n - if self.__requires('m2'): - self.m2 += delta * (value - self.mu) - - return self - - # checks whether the passed attribute name is required to be updated in order to support the - # statistics requested in self.requested - def __requires(self, attrname): - return attrname in self.required - - # merge another StatCounter into this one, adding up the statistics - def combine(self, other): - if not isinstance(other, StatCounter): - raise Exception("can only merge StatCounters!") - - # reference equality holds - if other is self: - # avoid overwriting fields in a weird order - self.merge(copy.deepcopy(other)) - else: - # accumulator should only be updated if it's valid in both statcounters - self.required = set(self.required).intersection(set(other.required)) - - if self.n == 0: - self.n = other.n - for attrname in ('mu', 'm2'): - if self.__requires(attrname): - setattr(self, attrname, getattr(other, attrname)) - - elif other.n != 0: - if self.__requires('mu'): - delta = other.mu - self.mu - if other.n * 10 < self.n: - self.mu = self.mu + (delta * other.n) / (self.n + other.n) - elif self.n * 10 < other.n: - self.mu = other.mu - (delta * self.n) / (self.n + other.n) - else: - self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n) - - if self.__requires('m2'): - self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n) - - self.n += other.n - return self - - def count(self): - return self.n - - def __isavail(self, attrname): - if not all(attr in self.required for attr 
in StatCounter.REQUIRED_FOR[attrname]): - raise ValueError("'%s' stat not available, must be requested at " - "StatCounter instantiation" % attrname) - - @property - def mean(self): - self.__isavail('mean') - return self.mu - - @property - def sum(self): - self.__isavail('sum') - return self.n * self.mu - - @property - def variance(self): - self.__isavail('variance') - if self.n == 0: - return float('nan') - else: - return self.m2 / self.n - - @property - def stdev(self): - self.__isavail('stdev') - return sqrt(self.variance) diff --git a/bolt/spark/utils.py b/bolt/spark/utils.py deleted file mode 100644 index c006dd2..0000000 --- a/bolt/spark/utils.py +++ /dev/null @@ -1,31 +0,0 @@ -def get_kv_shape(shape, key_axes): - func = lambda axis: shape[axis] - return _get_kv_func(func, shape, key_axes) - -def get_kv_axes(shape, key_axes): - func = lambda axis: axis - return _get_kv_func(func, shape, key_axes) - -def _get_kv_func(func, shape, key_axes): - key_res = [func(axis) for axis in key_axes] - value_res = [func(axis) for axis in range(len(shape)) if axis not in key_axes] - return key_res, value_res - -def zip_with_index(rdd): - """ - Alternate version of Spark's zipWithIndex that eagerly returns count. - """ - starts = [0] - if rdd.getNumPartitions() > 1: - nums = rdd.mapPartitions(lambda it: [sum(1 for _ in it)]).collect() - count = sum(nums) - for i in range(len(nums) - 1): - starts.append(starts[-1] + nums[i]) - else: - count = rdd.count() - - def func(k, it): - for i, v in enumerate(it, starts[k]): - yield v, i - - return count, rdd.mapPartitionsWithIndex(func) diff --git a/test/generic.py b/test/generic.py index 62fe588..a94010a 100644 --- a/test/generic.py +++ b/test/generic.py @@ -94,24 +94,24 @@ def reduce_suite(arr, b): # Reduce over the first axis with an add reduced = b.reduce(add, axis=0) - res = reduced.toarray() + res = reduced assert res.shape == (arr.shape[1], arr.shape[2]) assert allclose(res, sum(arr, 0)) # Reduce over multiple axes with an add reduced = b.reduce(add, axis=(0, 1)) - res = reduced.toarray() + res = reduced assert res.shape == (arr.shape[2],) assert allclose(res, sum(sum(arr, 0), 1)) # Reduce over various other axes with an add reduced = b.reduce(add, axis=1) - res = reduced.toarray() + res = reduced assert res.shape == (arr.shape[0], arr.shape[2]) assert allclose(res, sum(arr, 1)) reduced = b.reduce(add, axis=(1, 2)) - res = reduced.toarray() + res = reduced assert res.shape == (arr.shape[0],) assert allclose(res, sum(sum(arr, 1), 1)) diff --git a/test/local/test_local_basic.py b/test/local/test_local_basic.py deleted file mode 100644 index 306426c..0000000 --- a/test/local/test_local_basic.py +++ /dev/null @@ -1,44 +0,0 @@ -from numpy import arange -from bolt import array -from bolt.spark.array import BoltArraySpark -from bolt.utils import allclose - - -def test_construct(): - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x) - assert b.shape == (2, 3, 4) - - -def test_toarray(): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x) - assert allclose(b.toarray(), x) - -def test_tospark(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x) - s = b.tospark(sc, axis=0) - assert isinstance(s, BoltArraySpark) - assert s.shape == (2, 3, 4) - assert allclose(s.toarray(), x) - -def test_tordd(sc): - - from pyspark import RDD - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x) - r = b.tordd(sc, axis=0) - assert isinstance(r, RDD) - assert r.count() == 2 - - r = b.tordd(sc, axis=(0, 1)) - assert isinstance(r, RDD) - assert r.count() == 2*3 - - r = 
b.tordd(sc, axis=(0, 1, 2)) - assert isinstance(r, RDD) - assert r.count() == 2*3*4 diff --git a/test/local/test_local_construct.py b/test/local/test_local_construct.py deleted file mode 100644 index 737b163..0000000 --- a/test/local/test_local_construct.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest -from numpy import arange -from bolt import array, ones, zeros, concatenate -from bolt.utils import allclose - - -def test_array(): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x) - assert allclose(x, b.toarray()) - -def test_ones(): - - from numpy import ones as npones - x = npones((2, 3, 4)) - b = ones((2, 3, 4)) - assert allclose(x, b.toarray()) - -def test_zeros(): - from numpy import zeros as npzeros - x = npzeros((2, 3, 4)) - b = zeros((2, 3, 4)) - assert allclose(x, b.toarray()) - -def test_concatenate(): - - from numpy import concatenate as npconcatenate - x = arange(2*3*4).reshape((2, 3, 4)) - b = concatenate((x, x)) - assert allclose(npconcatenate((x, x)), b.toarray()) - -def test_concatenate_errors(): - - x = arange(2*3*4).reshape((2, 3, 4)) - - with pytest.raises(ValueError): - concatenate(x) diff --git a/test/local/test_local_functional.py b/test/local/test_local_functional.py deleted file mode 100644 index 4c1ff82..0000000 --- a/test/local/test_local_functional.py +++ /dev/null @@ -1,52 +0,0 @@ -from numpy import arange, repeat -from bolt import array -from bolt.utils import allclose -import generic - - -def test_map(): - - import random - random.seed(42) - - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x) - - # Test all generic map functionality - generic.map_suite(x, b) - - -def test_reduce(): - - from numpy import asarray - - dims = (10, 10, 10) - area = dims[0] * dims[1] - arr = asarray([repeat(x,area).reshape(dims[0], dims[1]) for x in range(dims[2])]) - b = array(arr) - - # Test all generic reduce functionality - generic.reduce_suite(arr, b) - - -def test_filter(): - - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x) - - # Test all generic filter functionality - generic.filter_suite(x, b) - -def test_ufuncs(): - - x = arange(2*3*4*5).reshape(2, 3, 4, 5) - b = array(x) - - # test a common ufunc (sum) over different dimensions - assert allclose(x.sum(axis=0), b.sum(axis=0).toarray()) - assert allclose(x.sum(axis=(0, 1)), b.sum(axis=(0, 1)).toarray()) - assert allclose(x.sum(axis=(0, 1, 2)), b.sum(axis=(0, 1, 2)).toarray()) - assert x.sum() == b.sum() - - - diff --git a/test/spark/test_spark_basic.py b/test/spark/test_spark_basic.py deleted file mode 100644 index 6593af9..0000000 --- a/test/spark/test_spark_basic.py +++ /dev/null @@ -1,159 +0,0 @@ -from numpy import arange, dtype, int64, float64 -from bolt import array, ones -from bolt.utils import allclose - -def test_shape(sc): - - x = arange(2*3).reshape((2, 3)) - b = array(x, sc) - assert b.shape == x.shape - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc) - assert b.shape == x.shape - -def test_size(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc, axis=0) - assert b.size == x.size - -def test_split(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc, axis=0) - assert b.split == 1 - - b = array(x, sc, axis=(0, 1)) - assert b.split == 2 - -def test_ndim(sc): - - x = arange(2**5).reshape(2, 2, 2, 2, 2) - b = array(x, sc, axis=(0, 1, 2)) - - assert b.keys.ndim == 3 - assert b.values.ndim == 2 - assert b.ndim == 5 - -def test_mask(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc, axis=0) - assert b.mask == (1, 0, 0) - - b = array(x, sc, axis=(0, 1)) - assert 
b.mask == (1, 1, 0) - - b = array(x, sc, axis=(0, 1, 2)) - assert b.mask == (1, 1, 1) - -def test_cache(sc): - - x = arange(2*3).reshape((2, 3)) - b = array(x, sc) - b.cache() - assert b._rdd.is_cached - b.unpersist() - assert not b._rdd.is_cached - -def test_repartition(sc): - x = arange(2 * 3).reshape((2, 3)) - b = array(x, sc) - assert b._ordered - b = b.repartition(10) - assert not b._ordered - assert b._rdd.getNumPartitions() == 10 - -def test_concatenate(sc): - - from numpy import concatenate - x = arange(2*3).reshape((2, 3)) - b = array(x, sc) - c = array(x) - assert allclose(b.concatenate(x).toarray(), concatenate((x, x))) - assert allclose(b.concatenate(b).toarray(), concatenate((x, x))) - assert allclose(b.concatenate(c).toarray(), concatenate((x, x))) - -def test_dtype(sc): - - a = arange(2**8, dtype=int64) - b = array(a, sc, dtype=int64) - assert a.dtype == b.dtype - assert b.dtype == dtype(int64) - dtypes = b._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(int64) - - a = arange(2.0**8) - b = array(a, sc) - assert a.dtype == b.dtype - assert b.dtype == dtype(float64) - dtypes = b._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(float64) - - a = arange(2**8) - b = array(a, sc) - assert a.dtype == b.dtype - assert b.dtype == dtype(int64) - dtypes = b._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(int64) - - from numpy import ones as npones - a = npones(2**8, dtype=bool) - b = array(a, sc) - assert a.dtype == b.dtype - assert b.dtype == dtype(bool) - dtypes = b._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(bool) - - b = ones(2**8, sc) - assert b.dtype == dtype(float64) - dtypes = b._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(float64) - - b = ones(2**8, sc, dtype=bool) - assert b.dtype == dtype(bool) - dtypes = b._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(bool) - -def test_astype(sc): - - from numpy import ones as npones - - a = npones(2**8, dtype=int64) - b = array(a, sc, dtype=int64) - c = b.astype(bool) - assert c.dtype == dtype(bool) - dtypes = c._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(bool) - - b = ones((100, 100), sc, dtype=int64) - c = b.astype(bool) - assert c.dtype == dtype(bool) - dtypes = c._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(bool) - - b = ones((100, 100), sc) - c = b.astype(bool) - assert c.dtype == dtype(bool) - dtypes = c._rdd.map(lambda x: x[1].dtype).collect() - for dt in dtypes: - assert dt == dtype(bool) - -def test_clip(sc): - - from numpy import arange - - a = arange(4).reshape(2, 2) - b = array(a, sc) - assert allclose(b.clip(0).toarray(), a.clip(0)) - assert allclose(b.clip(2).toarray(), a.clip(2)) - assert allclose(b.clip(1, 2).toarray(), a.clip(1, 2)) \ No newline at end of file diff --git a/test/spark/test_spark_chunking.py b/test/spark/test_spark_chunking.py deleted file mode 100644 index 008106f..0000000 --- a/test/spark/test_spark_chunking.py +++ /dev/null @@ -1,208 +0,0 @@ -import pytest -from numpy import arange, split, array_equal, empty, newaxis -from bolt import array, ones -from bolt.utils import allclose - -def test_chunk(sc): - - x = arange(4*6).reshape(1, 4, 6) - b = array(x, sc) - - k1, v1 = zip(*b.chunk((2,3))._rdd.sortByKey().collect()) - k2 = ((0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1)) - v2 = [s for m in split(x[0], (2,), axis=0) for s in 
split(m, (3,), axis=1)] - assert k1 == k2 - assert all([allclose(m1, m2) for (m1, m2) in zip(v1, v2)]) - - k1, v1 = zip(*b.chunk((3,4))._rdd.sortByKey().collect()) - k2 = ((0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1)) - v2 = [s for m in split(x[0], (3,), axis=0) for s in split(m, (4,), axis=1)] - assert k1 == k2 - assert all([allclose(m1, m2) for (m1, m2) in zip(v1, v2)]) - -def test_unchunk(sc): - - x = arange(4*6).reshape(1, 4, 6) - b = array(x, sc) - - assert allclose(b.chunk((2, 3)).unchunk().toarray(), b.toarray()) - assert allclose(b.chunk((3, 4)).unchunk().toarray(), b.toarray()) - assert allclose(b.chunk((4, 6)).unchunk().toarray(), b.toarray()) - assert allclose(b.chunk('0.1').unchunk().toarray(), b.toarray()) - assert allclose(b.chunk().unchunk().toarray(), b.toarray()) - - x = arange(4*5*10).reshape(1, 4, 5, 10) - b = array(x, sc) - - assert allclose(b.chunk((4, 5, 10)).unchunk().toarray(), b.toarray()) - assert allclose(b.chunk((1, 1, 1)).unchunk().toarray(), b.toarray()) - assert allclose(b.chunk((3, 3, 3)).unchunk().toarray(), b.toarray()) - assert allclose(b.chunk((3, 3, 3)).unchunk().toarray(), b.toarray()) - - x = arange(4*6).reshape(4, 6) - b = array(x, sc, (0, 1)) - - assert allclose(b.chunk(()).unchunk().toarray(), b.toarray()) - - b = array(x, sc, (0,)) - - assert allclose(b.chunk((2)).unchunk().toarray(), b.toarray()) - -def test_keys_to_values(sc): - - x = arange(4*7*9*6).reshape(4, 7, 9, 6) - b = array(x, sc, (0, 1)) - c = b.chunk((4, 2)) - - assert allclose(x, c.keys_to_values((0,)).unchunk().toarray().transpose(1, 0, 2, 3)) - assert allclose(x, c.keys_to_values((1,)).unchunk().toarray()) - assert allclose(x, c.keys_to_values((1,), size=(3,)).unchunk().toarray()) - assert allclose(x, c.keys_to_values((0, 1)).unchunk().toarray()) - assert allclose(x, c.keys_to_values((0, 1), size=(2, 3)).unchunk().toarray()) - assert allclose(x, c.keys_to_values(()).unchunk().toarray()) - - b = array(x, sc, range(4)) - c = b.chunk(()) - - assert allclose(x, c.keys_to_values((3,)).unchunk().toarray()) - assert allclose(x, c.keys_to_values((0, 1)).unchunk().toarray().transpose(2, 3, 0, 1)) - - b = array(x, sc, (0,)) - c = b.chunk((2, 3, 4)) - - assert allclose(x, c.keys_to_values((0,)).unchunk().toarray()) - -def test_values_to_keys(sc): - - x = arange(4*7*9*6).reshape(4, 7, 9, 6) - b = array(x, sc, (0, 1)) - c = b.chunk((4, 2)) - - assert allclose(x, c.values_to_keys((0,)).unchunk().toarray()) - assert allclose(x, c.values_to_keys((1,)).unchunk().toarray().transpose(0, 1, 3, 2)) - assert allclose(x, c.values_to_keys((0, 1)).unchunk().toarray()) - assert allclose(x, c.values_to_keys(()).unchunk().toarray()) - - b = array(x, sc, (0,)) - c = b.chunk((2, 3, 4)) - - assert allclose(x, c.values_to_keys((0,)).unchunk().toarray()) - assert allclose(x, c.values_to_keys((0, 1)).unchunk().toarray()) - - -def test_padding(sc): - - x = arange(2*2*5*6).reshape(2, 2, 5, 6) - b = array(x, sc, (0, 1)) - - c = b.chunk((2, 2), padding=1) - chunks = c.tordd().sortByKey().values().collect() - assert allclose(chunks[0], array([[0, 1, 2], [6, 7, 8], [12, 13, 14]])) - assert allclose(chunks[1], array([[1, 2, 3, 4], [7, 8, 9, 10], [13, 14, 15, 16]])) - assert allclose(chunks[4], array([[7, 8, 9, 10], [13, 14, 15, 16], [19, 20, 21, 22], [25, 26, 27, 28]])) - assert allclose(chunks[6], array([[18, 19, 20], [24, 25, 26]])) - - c = b.chunk((3, 3), padding=(1, 2)) - chunks = c.tordd().sortByKey().values().collect() - assert allclose(chunks[0], array([[0, 1, 2, 3, 4], [6, 7, 8, 9, 10], [12, 13, 14, 15, 16], [18, 19, 
20, 21, 22]])) - - c = b.chunk((2,2), padding=1) - assert allclose(x, c.unchunk().toarray()) - assert allclose(x, c.keys_to_values((1,)).unchunk().toarray()) - assert allclose(x, c.values_to_keys((0,)).unchunk().toarray()) - -def test_padding_errors(sc): - - x = arange(2*2*5*6).reshape(2, 2, 5, 6) - b = array(x, sc, (0, 1)) - - with pytest.raises(ValueError): - c = b.chunk((2, 2), padding=(3, 1)) - - with pytest.raises(ValueError): - c = b.chunk((4, 4), padding=(2, 2)) - - with pytest.raises(NotImplementedError): - c = b.chunk((2, 2), padding=1) - d = c.map(lambda x: x[:, 0]) - -def test_map(sc): - - x = arange(4*8*8).reshape(4, 8, 8) - b = array(x, sc) - - c = b.chunk(size=(4, 8)) - - # no change of shape - def f(x): - return 2*x - - assert allclose(c.map(f).unchunk().toarray(), f(x)) - assert allclose(c.map(f, value_shape=(4, 8)).unchunk().toarray(), f(x)) - - # changing the size of an unchunked axis - def f(x): - return x[:, :4] - def f_local(x): - return x[:, :, :4] - - assert allclose(c.map(f).unchunk().toarray(), f_local(x)) - assert allclose(c.map(f, value_shape=(4, 4)).unchunk().toarray(), f_local(x)) - -def test_map_errors(sc): - - x = arange(4*8*8).reshape(4, 8, 8) - b = array(x, sc) - - c = b.chunk(size=(4, 8)) - - # changing the size of a chunked axis - def f(x): - return x[:2, :] - - with pytest.raises(ValueError): - c.map(f) - - with pytest.raises(ValueError): - c.map(f, value_shape=(2, 8)) - - # dropping dimensions - def f(x): - return x[0, :] - - with pytest.raises(NotImplementedError): - c.map(f) - - with pytest.raises(NotImplementedError): - c.map(f, value_shape=(4,)) - -def test_map_generic(sc): - - x = arange(2*8*8).reshape(2, 8, 8) - b = array(x, sc) - - c = b.chunk(size=(8, 5)) - d = c.map_generic(lambda x: [0, 1]).toarray() - - truth = empty(2*1*2, dtype=object) - for i in range(truth.shape[0]): - truth[i] = [0, 1] - truth = truth.reshape(2, 1, 2) - - assert array_equal(d, truth) - -def test_properties(sc): - - x = arange(4*6).reshape(1, 4, 6) - b = array(x, sc) - - assert b.chunk(size=(2, 3)).uniform is True - assert b.chunk(size=(2, 4)).uniform is False - -def test_args(sc): - - x = arange(4*6).reshape(1, 4, 6) - b = array(x, sc) - - with pytest.raises(ValueError): - b.chunk(size=(5, 6)) diff --git a/test/spark/test_spark_construct.py b/test/spark/test_spark_construct.py deleted file mode 100644 index e8dc779..0000000 --- a/test/spark/test_spark_construct.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -from numpy import arange -from bolt import array, ones, zeros, concatenate -from bolt.utils import allclose -from bolt.spark.array import BoltArraySpark - -def test_array(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc) - assert isinstance(b, BoltArraySpark) - assert allclose(x, b.toarray()) - - b = array(x, sc, axis=0) - assert isinstance(b, BoltArraySpark) - assert allclose(x, b.toarray()) - - b = array(x, sc, axis=(0, 1)) - assert isinstance(b, BoltArraySpark) - assert allclose(x, b.toarray()) - - b = array(x, sc, axis=(0, 1), npartitions=5) - assert isinstance(b, BoltArraySpark) - assert allclose(x, b.toarray()) - assert b.tordd().getNumPartitions() == 5 - -def test_array_errors(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - with pytest.raises(ValueError): - array(x, sc, axis=-1) - - with pytest.raises(ValueError): - array(x, sc, axis=(0, 1, 2, 3)) - -def test_ones(sc): - - from numpy import ones as npones - x = npones((2, 3, 4)) - b = ones((2, 3, 4), sc) - assert allclose(x, b.toarray()) - - x = npones(5) - b = ones(5, sc) - assert 
allclose(x, b.toarray()) - -def test_zeros(sc): - - from numpy import zeros as npzeros - x = npzeros((2, 3, 4)) - b = zeros((2, 3, 4), sc) - assert allclose(x, b.toarray()) - - x = npzeros(5) - b = zeros(5, sc) - assert allclose(x, b.toarray()) - -def test_concatenate(sc): - - from numpy import concatenate as npconcatenate - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=0) - bb = concatenate((b, b), axis=0) - assert allclose(npconcatenate((x, x), axis=0), bb.toarray()) - - bb = concatenate((b, b), axis=1) - assert allclose(npconcatenate((x, x), axis=1), bb.toarray()) - - bb = concatenate((b, b), axis=2) - assert allclose(npconcatenate((x, x), axis=2), bb.toarray()) - - b = array(x, sc, axis=(0, 1)) - bb = concatenate((b, b), axis=0) - assert allclose(npconcatenate((x, x), axis=0), bb.toarray()) - - b = array(x, sc, axis=(0, 1)) - bb = concatenate((b, b), axis=1) - assert allclose(npconcatenate((x, x), axis=1), bb.toarray()) - - b = array(x, sc, axis=(0, 1)) - bb = concatenate((b, b), axis=2) - assert allclose(npconcatenate((x, x), axis=2), bb.toarray()) - -def test_concatenate_errors(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc, axis=0) - - with pytest.raises(ValueError): - concatenate(b) - - with pytest.raises(NotImplementedError): - concatenate((b, b, b)) diff --git a/test/spark/test_spark_functional.py b/test/spark/test_spark_functional.py deleted file mode 100644 index ad778dd..0000000 --- a/test/spark/test_spark_functional.py +++ /dev/null @@ -1,118 +0,0 @@ -import pytest -from numpy import arange, repeat -from bolt import array -from bolt.utils import allclose -import generic - -def test_map(sc): - import random - random.seed(42) - - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=0) - - # Test all map functionality when the base array is split after the first axis - generic.map_suite(x, b) - - # Split the BoltArraySpark after the second axis and rerun the tests - b = array(x, sc, axis=(0, 1)) - generic.map_suite(x, b) - - # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests - b = array(x, sc, axis=(0, 1, 2)) - generic.map_suite(x, b) - -def test_map_with_keys(sc): - x = arange(2*3).reshape(2, 3) - b = array(x, sc, axis=0) - c = b.map(lambda kv: kv[0] + kv[1], with_keys=True) - assert allclose(b.toarray() + [[0, 0, 0], [1, 1, 1]], c.toarray()) - -def test_reduce(sc): - from numpy import asarray - - dims = (10, 10, 10) - area = dims[0] * dims[1] - arr = asarray([repeat(x, area).reshape(dims[0], dims[1]) for x in range(dims[2])]) - b = array(arr, sc, axis=0) - - # Test all reduce functionality when the base array is split after the first axis - generic.reduce_suite(arr, b) - - # Split the BoltArraySpark after the second axis and rerun the tests - b = array(arr, sc, axis=(0, 1)) - generic.reduce_suite(arr, b) - - # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests - b = array(arr, sc, axis=(0, 1, 2)) - generic.reduce_suite(arr, b) - -def test_filter(sc): - - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=0) - - # Test all filter functionality when the base array is split after the first axis - generic.filter_suite(x, b) - - # Split the BoltArraySpark after the second axis and rerun the tests - b = array(x, sc, axis=(0, 1)) - generic.filter_suite(x, b) - - # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests - b = array(x, sc, axis=(0, 1, 2)) - generic.filter_suite(x, b) - -def test_mean(sc): - x = arange(2*3*4).reshape(2, 3, 4) - b = 
array(x, sc, axis=(0,)) - - assert allclose(b.mean(), x.mean()) - assert allclose(b.mean(axis=0), x.mean(axis=0)) - assert allclose(b.mean(axis=(0, 1)), x.mean(axis=(0, 1))) - assert b.mean(axis=(0, 1, 2)) == x.mean(axis=(0, 1, 2)) - -def test_std(sc): - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=(0,)) - - assert allclose(b.std(), x.std()) - assert allclose(b.std(axis=0), x.std(axis=0)) - assert allclose(b.std(axis=(0, 1)), x.std(axis=(0, 1))) - assert b.std(axis=(0, 1, 2)) == x.std(axis=(0, 1, 2)) - -def test_var(sc): - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=(0,)) - - assert allclose(b.var(), x.var()) - assert allclose(b.var(axis=0), x.var(axis=0)) - assert allclose(b.var(axis=(0, 1)), x.var(axis=(0, 1))) - assert b.var(axis=(0, 1, 2)) == x.var(axis=(0, 1, 2)) - -def test_sum(sc): - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=(0,)) - - assert allclose(b.sum(), x.sum()) - assert allclose(b.sum(axis=0), x.sum(axis=0)) - assert allclose(b.sum(axis=(0, 1)), x.sum(axis=(0, 1))) - assert b.sum(axis=(0, 1, 2)) == x.sum(axis=(0, 1, 2)) - -def test_min(sc): - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=(0,)) - - assert allclose(b.min(), x.min()) - assert allclose(b.min(axis=0), x.min(axis=0)) - assert allclose(b.min(axis=(0, 1)), x.min(axis=(0, 1))) - assert b.min(axis=(0, 1, 2)) == x.min(axis=(0, 1, 2)) - -def test_max(sc): - x = arange(2*3*4).reshape(2, 3, 4) - b = array(x, sc, axis=(0,)) - - assert allclose(b.max(), x.max()) - assert allclose(b.max(axis=0), x.max(axis=0)) - assert allclose(b.max(axis=(0, 1)), x.max(axis=(0, 1))) - assert b.max(axis=(0, 1, 2)) == x.max(axis=(0, 1, 2)) diff --git a/test/spark/test_spark_getting.py b/test/spark/test_spark_getting.py deleted file mode 100644 index 64f0358..0000000 --- a/test/spark/test_spark_getting.py +++ /dev/null @@ -1,170 +0,0 @@ -import pytest -from numpy import arange -from bolt import array, ones -from bolt.utils import allclose - - -def test_getitem_slice(sc): - x = arange(6*6).reshape((6, 6)) - - b = array(x, sc, axis=0) - assert allclose(b[0:1, 0:1].toarray(), x[0:1, 0:1]) - assert allclose(b[0:2, 0:2].toarray(), x[0:2, 0:2]) - assert allclose(b[0:2, 0:3].toarray(), x[0:2, 0:3]) - assert allclose(b[0:2, 0:3:2].toarray(), x[0:2, 0:3:2]) - assert allclose(b[:2, :2].toarray(), x[:2, :2]) - assert allclose(b[1:, 1:].toarray(), x[1:, 1:]) - assert allclose(b[5:1:-1, 5:1:-1].toarray(), x[5:1:-1, 5:1:-1]) - assert allclose(b[10:-10:-2, 10:-10:-2].toarray(), x[10:-10:-2, 10:-10:-2]) - assert allclose(b[-5:-1, -5:-1].toarray(), x[-5:-1, -5:-1]) - assert allclose(b[-1:-5:-2, -1:-5:-2].toarray(), x[-1:-5:-2, -1:-5:-2]) - - b = array(x, sc, axis=(0, 1)) - assert allclose(b[0:1, 0:1].toarray(), x[0:1, 0:1]) - assert allclose(b[0:2, 0:2].toarray(), x[0:2, 0:2]) - assert allclose(b[0:2, 0:3].toarray(), x[0:2, 0:3]) - assert allclose(b[0:2, 0:3:2].toarray(), x[0:2, 0:3:2]) - assert allclose(b[:2, :2].toarray(), x[:2, :2]) - assert allclose(b[1:, 1:].toarray(), x[1:, 1:]) - assert allclose(b[5:1:-1, 5:1:-1].toarray(), x[5:1:-1, 5:1:-1]) - assert allclose(b[10:-10:-2, 10:-10:-2].toarray(), x[10:-10:-2, 10:-10:-2]) - assert allclose(b[-5:-1, -5:-1].toarray(), x[-5:-1, -5:-1]) - assert allclose(b[-1:-5:-2, -1:-5:-2].toarray(), x[-1:-5:-2, -1:-5:-2]) - -def test_getitem_slice_ragged(sc): - - x = arange(10*10*3).reshape((10, 10, 3)) - - b = array(x, sc, axis=(0,1)) - assert allclose(b[0:5:2, 0:2].toarray(), x[0:5:2, 0:2]) - assert allclose(b[0:5:3, 0:2].toarray(), x[0:5:3, 0:2]) - assert 
allclose(b[0:9:3, 0:2].toarray(), x[0:9:3, 0:2]) - -def test_getitem_int(sc): - - x = arange(2*3).reshape((2, 3)) - - b = array(x, sc, axis=0) - assert allclose(b[0, 0], x[0, 0]) - assert allclose(b[0, 1], x[0, 1]) - assert allclose(b[0, 0:1], x[0, 0:1]) - assert allclose(b[1, 2], x[1, 2]) - assert allclose(b[0], x[0]) - assert allclose(b[[0]], x[[0]]) - assert allclose(b[(0)], x[(0)]) - assert allclose(b[[1], [2]], x[[1], [2]]) - assert allclose(b[[1], 2], x[[1], 2]) - assert allclose(b[-1, -2], x[-1, -2]) - - b = array(x, sc, axis=(0, 1)) - assert allclose(b[0, 0], x[0, 0]) - assert allclose(b[0, 1], x[0, 1]) - assert allclose(b[0, 0:1], x[0, 0:1]) - assert allclose(b[1, 2], x[1, 2]) - assert allclose(b[0], x[0]) - assert allclose(b[[0]], x[[0]]) - assert allclose(b[(0)], x[(0)]) - assert allclose(b[[1], [2]], x[[1], [2]]) - assert allclose(b[[1], 2], x[[1], 2]) - assert allclose(b[-1, -2], x[-1, -2]) - -def test_getitem_list(sc): - - x = arange(3*3*4).reshape((3, 3, 4)) - - b = array(x, sc, axis=0) - assert allclose(b[[0, 1], [0, 1], [0, 2]].toarray(), x[[0, 1], [0, 1], [0, 2]]) - assert allclose(b[[0, 1], [0, 2], [0, 3]].toarray(), x[[0, 1], [0, 2], [0, 3]]) - assert allclose(b[[0, 1, 2], [0, 2, 1], [0, 3, 1]].toarray(), x[[0, 1, 2], [0, 2, 1], [0, 3, 1]]) - - b = array(x, sc, axis=(0,1)) - assert allclose(b[[0, 1], [0, 1], [0, 2]].toarray(), x[[0, 1], [0, 1], [0, 2]]) - assert allclose(b[[0, 1], [0, 2], [0, 3]].toarray(), x[[0, 1], [0, 2], [0, 3]]) - assert allclose(b[[0, 1, 2], [0, 2, 1], [0, 3, 1]].toarray(), x[[0, 1, 2], [0, 2, 1], [0, 3, 1]]) - -def test_getitem_list_array(sc): - - x = arange(3*3*4).reshape((3, 3, 4)) - - rows = [[0, 0], [1, 1]] - cols = [[0, 2], [0, 2]] - dept = [[0, 3], [0, 3]] - - b = array(x, sc, axis=0) - assert allclose(b[rows, cols, dept].toarray(), x[rows, cols, dept]) - - b = array(x, sc, axis=(0, 1)) - assert allclose(b[rows, cols, dept].toarray(), x[rows, cols, dept]) - -def test_getitem_mixed(sc): - - x = arange(4*4*4*4).reshape(4, 4, 4, 4) - b = array(x, sc, axis=(0, 1)) - - i = [0, 1] - s = slice(1, 3) - assert allclose(b[i, :, :, :].toarray(), x[i, :, :, :]) - assert allclose(b[i, s, s, s].toarray(), x[i, s, s, s]) - assert allclose(b[:, :, i, :].toarray(), x[:, :, i, :]) - assert allclose(b[s, s, i, s].toarray(), x[s, s, i, s]) - - i = [1] - assert allclose(b[i, :, :, :].toarray(), x[i, :, :, :]) - assert allclose(b[:, :, i, :].toarray(), x[:, :, i, :]) - - i = [[0, 1], [1, 0]] - with pytest.raises(ValueError): - b[i, :, :, :] - -def test_bounds(sc): - - x = arange(5) - b = array(x, sc) - - # out of bounds - with pytest.raises(ValueError): - b[5] - - with pytest.raises(ValueError): - b[-6] - - with pytest.raises(ValueError): - b[[1,5]] - - # slicing that would produce an empty dimension - with pytest.raises(ValueError): - b[3:2] - - with pytest.raises(ValueError): - b[5:] - - with pytest.raises(ValueError): - b[-6:0] - -def test_squeeze(sc): - - from numpy import ones as npones - - x = npones((1, 2, 1, 4)) - b = ones((1, 2, 1, 4), sc, axis=0) - assert allclose(b.squeeze().toarray(), x.squeeze()) - assert allclose(b.squeeze((0, 2)).toarray(), x.squeeze((0, 2))) - assert allclose(b.squeeze(0).toarray(), x.squeeze(0)) - assert allclose(b.squeeze(2).toarray(), x.squeeze(2)) - assert b.squeeze().split == 0 - assert b.squeeze((0, 2)).split == 0 - assert b.squeeze(2).split == 1 - - x = npones((1, 2, 1, 4)) - b = ones((1, 2, 1, 4), sc, axis=(0, 1)) - assert allclose(b.squeeze().toarray(), x.squeeze()) - assert allclose(b.squeeze((0, 2)).toarray(), 
x.squeeze((0, 2))) - assert allclose(b.squeeze(0).toarray(), x.squeeze(0)) - assert allclose(b.squeeze(2).toarray(), x.squeeze(2)) - assert b.squeeze().split == 1 - assert b.squeeze((0, 2)).split == 1 - assert b.squeeze(2).split == 2 - - x = npones((1, 1, 1, 1)) - b = ones((1, 1, 1, 1), sc, axis=(0, 1)) - assert allclose(b.squeeze().toarray(), x.squeeze()) diff --git a/test/spark/test_spark_shaping.py b/test/spark/test_spark_shaping.py deleted file mode 100644 index 6c1690d..0000000 --- a/test/spark/test_spark_shaping.py +++ /dev/null @@ -1,247 +0,0 @@ -import pytest -from numpy import arange, prod -from itertools import permutations -from bolt import array -from bolt.utils import allclose - -def test_value_shape(sc): - - x = arange(2*3).reshape((2, 3)) - b = array(x, sc) - assert b.values.shape == (3,) - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc, axis=0) - assert b.values.shape == (3, 4) - -def test_key_shape(sc): - - x = arange(2*3).reshape((2, 3)) - b = array(x, sc) - assert b.keys.shape == (2,) - - x = arange(2*3*4).reshape((2, 3, 4)) - b = array(x, sc, axis=(0, 1)) - assert b.keys.shape == (2, 3) - -def test_reshape_keys(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=(0, 1)) - c = b.keys.reshape((3, 2)) - assert c.keys.shape == (3, 2) - assert allclose(c.toarray(), x.reshape((3, 2, 4))) - - b = array(x, sc, axis=0) - c = b.keys.reshape((2, 1)) - assert allclose(c.toarray(), x.reshape((2, 1, 3, 4))) - - b = array(x, sc, axis=(0,)) - c = b.keys.reshape((2,)) - assert allclose(c.toarray(), x) - - b = array(x, sc, axis=(0, 1)) - c = b.keys.reshape((2, 3)) - assert allclose(c.toarray(), x) - -def test_reshape_keys_errors(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=(0, 1)) - with pytest.raises(ValueError): - b.keys.reshape((2, 3, 4)) - -def test_reshape_values(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=(0,)) - c = b.values.reshape((4, 3)) - assert c.values.shape == (4, 3) - assert allclose(c.toarray(), x.reshape((2, 4, 3))) - - b = array(x, sc, axis=(0, 1)) - c = b.values.reshape((1, 4)) - assert allclose(c.toarray(), x.reshape((2, 3, 1, 4))) - - b = array(x, sc, axis=(0, 1)) - c = b.values.reshape((4,)) - assert allclose(c.toarray(), x) - - b = array(x, sc, axis=0) - c = b.values.reshape((3, 4)) - assert allclose(c.toarray(), x) - -def test_reshape_values_errors(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=(0, 1)) - with pytest.raises(ValueError): - b.values.reshape((2, 3, 4)) - -def test_transpose_keys(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=(0, 1)) - c = b.keys.transpose((1, 0)) - assert c.keys.shape == (3, 2) - assert allclose(c.toarray(), x.transpose((1, 0, 2))) - - b = array(x, sc, axis=0) - c = b.keys.transpose((0,)) - assert allclose(c.toarray(), x) - - b = array(x, sc, axis=(0, 1)) - c = b.keys.transpose((0, 1)) - assert allclose(c.toarray(), x) - -def test_transpose_keys_errors(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=(0, 1)) - with pytest.raises(ValueError): - b.keys.transpose((0, 2)) - - with pytest.raises(ValueError): - b.keys.transpose((1, 1)) - - with pytest.raises(ValueError): - b.keys.transpose((0,)) - -def test_transpose_values(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=0) - c = b.values.transpose((1, 0)) - assert c.values.shape == (4, 3) - assert allclose(c.toarray(), x.transpose((0, 2, 1))) - - b = array(x, sc, axis=0) - c = b.values.transpose((0, 1)) - 
assert allclose(c.toarray(), x) - - b = array(x, sc, axis=(0, 1)) - c = b.values.transpose((0,)) - assert allclose(c.toarray(), x.reshape((2, 3, 4))) - -def test_traspose_values_errors(sc): - - x = arange(2*3*4).reshape((2, 3, 4)) - - b = array(x, sc, axis=0) - with pytest.raises(ValueError): - b.values.transpose((0, 2)) - - with pytest.raises(ValueError): - b.values.transpose((1, 1)) - - with pytest.raises(ValueError): - b.values.transpose((0,)) - - -def test_swap(sc): - - a = arange(2**8).reshape(*(8*[2])) - b = array(a, sc, axis=(0, 1, 2, 3)) - - bs = b.swap((1, 2), (0, 3), size=(2, 2)) - at = a.transpose((0, 3, 4, 7, 1, 2, 5, 6)) - assert allclose(at, bs.toarray()) - - bs = b.swap((1, 2), (0, 3), size="50") - at = a.transpose((0, 3, 4, 7, 1, 2, 5, 6)) - assert allclose(at, bs.toarray()) - - bs = b.swap((1, 2), (0, 3)) - at = a.transpose((0, 3, 4, 7, 1, 2, 5, 6)) - assert allclose(at, bs.toarray()) - - bs = b.swap((), (0, 1, 2, 3)) - at = a - assert allclose(at, bs.toarray()) - - bs = b.swap(0, 0) - at = a.transpose((1, 2, 3, 4, 0, 5, 6, 7)) - assert allclose(at, bs.toarray()) - - bs = b.swap([], 0) - at = a.transpose((0, 1, 2, 3, 4, 5, 6, 7)) - assert allclose(at, bs.toarray()) - assert bs.split == 5 - - bs = b.swap(0, []) - at = a.transpose((1, 2, 3, 0, 4, 5, 6, 7)) - assert allclose(at, bs.toarray()) - assert bs.split == 3 - - b = array(a, sc, axis=range(8)) - bs = b.swap([0,1], []) - at = a.transpose((2, 3, 4, 5, 6, 7, 0, 1)) - assert allclose(at, bs.toarray()) - assert bs.split == 6 - - a = arange(2*3*4).reshape(2, 3, 4) - b = array(a, sc, axis=(0,)) - - bs = b.swap((0,), (0, 1)) - at = a.transpose(1, 2, 0) - assert allclose(at, bs.toarray()) - - -def test_transpose(sc): - - n = 4 - perms = list(permutations(range(n), n)) - - a = arange(2*3*4*5).reshape((2, 3, 4, 5)) - - b = array(a, sc, axis=(0, 1)) - for p in perms: - assert allclose(b.transpose(p).toarray(), b.toarray().transpose(p)) - - assert allclose(b.transpose(), b.toarray().transpose()) - -def test_t(sc): - - a = arange(2*3*4*5).reshape((2, 3, 4, 5)) - - b = array(a, sc, axis=0) - assert allclose(b.T.toarray(), b.toarray().T) - - b = array(a, sc, axis=(0, 1)) - assert allclose(b.T.toarray(), b.toarray().T) - -def test_swapaxes(sc): - - a = arange(2*3*4*5).reshape((2, 3, 4, 5)) - - b = array(a, sc, axis=(0, 1)) - assert allclose(b.swapaxes(1, 2).toarray(), b.toarray().swapaxes(1, 2)) - assert allclose(b.swapaxes(0, 1).toarray(), b.toarray().swapaxes(0, 1)) - assert allclose(b.swapaxes(2, 3).toarray(), b.toarray().swapaxes(2, 3)) - -def test_reshape(sc): - - old_shape = (6, 10, 4, 12) - a = arange(prod(old_shape)).reshape(old_shape) - b = array(a, sc, axis=(0, 1)) - - # keys only - new_shape = (15, 4, 4, 12) - assert allclose(b.reshape(new_shape).toarray(), b.toarray().reshape(new_shape)) - # values only - new_shape = (6, 10, 24, 2) - assert allclose(b.reshape(new_shape).toarray(), b.toarray().reshape(new_shape)) - # keys and values, independent - new_shape = (15, 4, 24, 2) - assert allclose(b.reshape(new_shape).toarray(), b.toarray().reshape(new_shape)) - # keys and values, mixing - new_shape = (6, 4, 10, 12) - with pytest.raises(NotImplementedError): - b.reshape(new_shape) diff --git a/test/spark/test_spark_stacking.py b/test/spark/test_spark_stacking.py deleted file mode 100644 index c57cfe4..0000000 --- a/test/spark/test_spark_stacking.py +++ /dev/null @@ -1,133 +0,0 @@ -import pytest -from numpy import arange, repeat, asarray, vstack, tile -from bolt import array, ones -from bolt.utils import allclose -from 
bolt.spark.array import BoltArraySpark - - -def _2D_stackable_preamble(sc, num_partitions=2): - - dims = (10, 10) - arr = vstack([[x]*dims[1] for x in arange(dims[0])]) - barr = array(arr, sc, axis=0) - barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions), - shape=barr.shape, split=barr.split) - return barr - -def _3D_stackable_preamble(sc, num_partitions=2): - - dims = (10, 10, 10) - area = dims[0] * dims[1] - arr = asarray([repeat(x, area).reshape(dims[0], dims[1]) for x in range(dims[2])]) - barr = array(arr, sc, axis=0) - barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions), - shape=barr.shape, split=barr.split) - return barr - -def test_stack_2D(sc): - - barr = _2D_stackable_preamble(sc) - - # without stack_size - stacked = barr.stack() - first_partition = stacked._rdd.first()[1] - assert first_partition.shape == (5, 10) - assert stacked.shape == (10, 10) - - # with stack_size - stacked = barr.stack(size=2) - first_partition = stacked._rdd.first()[1] - assert first_partition.shape == (2, 10) - - # invalid stack_size - stacked = barr.stack(size=0) - first_partition = stacked._rdd.first()[1] - assert first_partition.shape == (5, 10) - - # unstacking - unstacked = stacked.unstack() - arr = unstacked.toarray() - assert arr.shape == (10, 10) - assert allclose(arr, barr.toarray()) - -def test_stack_3D(sc): - - barr = _3D_stackable_preamble(sc) - - # with stack_size - stacked = barr.stack(size=2) - first_partition = stacked._rdd.first()[1] - assert first_partition.shape == (2, 10, 10) - - # invalid stack_size - stacked = barr.stack(size=0) - first_partition = stacked._rdd.first()[1] - assert first_partition.shape == (5, 10, 10) - - # unstacking - unstacked = stacked.unstack() - arr = unstacked.toarray() - assert arr.shape == (10, 10, 10) - assert allclose(arr, barr.toarray()) - -def test_stacked_map(sc): - - barr = _2D_stackable_preamble(sc) - - map_func1 = lambda x: x * 2 - - funcs = [map_func1] - - for func in funcs: - stacked = barr.stack() - stacked_map = stacked.map(func) - normal_map = barr.map(func) - unstacked = stacked_map.unstack() - assert normal_map.shape == unstacked.shape - assert normal_map.split == unstacked.split - assert allclose(normal_map.toarray(), unstacked.toarray()) - -def test_stacked_shape_inference(sc): - - from numpy import ones as npones - - a = ones((100, 2), sc) - a._rdd = a._rdd.partitionBy(2) - s = a.stack(5) - n = s.tordd().count() - - # operations that preserve keys - assert s.map(lambda x: x * 2).unstack().shape == (100, 2) - assert s.map(lambda x: x.sum(axis=1)).unstack().shape == (100,) - assert s.map(lambda x: tile(x, (1, 2))).unstack().shape == (100, 4) - - # operations that create new keys - assert s.map(lambda x: npones((2, 2))).unstack().shape == (n, 2, 2) - assert s.map(lambda x: x.sum(axis=0)).unstack().shape == (n, 2) - assert s.map(lambda x: asarray([2])).unstack().toarray().shape == (n, 1) - assert s.map(lambda x: asarray(2)).unstack().toarray().shape == (n,) - - # composing functions works - assert s.map(lambda x: x * 2).map(lambda x: x * 2).unstack().shape == (100, 2) - assert s.map(lambda x: x * 2).map(lambda x: npones((2, 2))).unstack().shape == (n, 2, 2) - assert s.map(lambda x: npones((2, 2))).map(lambda x: x * 2).unstack().shape == (n, 2, 2) - - # check the result - assert allclose(s.map(lambda x: x.sum(axis=1)).unstack().toarray(), npones(100) * 2) - assert allclose(s.map(lambda x: tile(x, (1, 2))).unstack().toarray(), npones((100, 4))) - - with pytest.raises(ValueError): - s.map(lambda x: 2) - - with 
pytest.raises(ValueError): - s.map(lambda x: None) - - with pytest.raises(RuntimeError): - s.map(lambda x: 1/0) - -def test_stacked_conversion(sc): - - from pyspark import RDD - barr = _2D_stackable_preamble(sc) - k1 = barr.tordd().keys() - assert isinstance(k1, RDD) \ No newline at end of file From e4209fd22b348b2d3a501d521f14f6677cf9292c Mon Sep 17 00:00:00 2001 From: jwittenbach Date: Fri, 6 Jan 2017 12:55:52 -0500 Subject: [PATCH 2/4] removes old files; adds new files --- bolt/array/__init__.py | 0 bolt/array/array.py | 1017 +++++++++++++++++++++++++++++++++ bolt/array/base.py | 158 +++++ bolt/array/chunk.py | 677 ++++++++++++++++++++++ bolt/array/construct.py | 212 +++++++ bolt/array/shapes.py | 167 ++++++ bolt/array/stack.py | 154 +++++ bolt/array/statcounter.py | 130 +++++ bolt/array/utils.py | 31 + test/test_spark_basic.py | 160 ++++++ test/test_spark_chunking.py | 208 +++++++ test/test_spark_construct.py | 96 ++++ test/test_spark_functional.py | 118 ++++ test/test_spark_getting.py | 170 ++++++ test/test_spark_shaping.py | 247 ++++++++ test/test_spark_stacking.py | 133 +++++ 16 files changed, 3678 insertions(+) create mode 100644 bolt/array/__init__.py create mode 100644 bolt/array/array.py create mode 100644 bolt/array/base.py create mode 100644 bolt/array/chunk.py create mode 100644 bolt/array/construct.py create mode 100644 bolt/array/shapes.py create mode 100644 bolt/array/stack.py create mode 100644 bolt/array/statcounter.py create mode 100644 bolt/array/utils.py create mode 100644 test/test_spark_basic.py create mode 100644 test/test_spark_chunking.py create mode 100644 test/test_spark_construct.py create mode 100644 test/test_spark_functional.py create mode 100644 test/test_spark_getting.py create mode 100644 test/test_spark_shaping.py create mode 100644 test/test_spark_stacking.py diff --git a/bolt/array/__init__.py b/bolt/array/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bolt/array/array.py b/bolt/array/array.py new file mode 100644 index 0000000..8915b60 --- /dev/null +++ b/bolt/array/array.py @@ -0,0 +1,1017 @@ +from __future__ import print_function +from numpy import asarray, unravel_index, prod, mod, ndarray, ceil, where, \ + r_, sort, argsort, array, random, arange, ones, expand_dims, sum +from itertools import groupby + +from bolt.array.base import BoltArray +from bolt.array.stack import StackedArray +from bolt.array.utils import zip_with_index +from bolt.array.statcounter import StatCounter +from bolt.utils import slicify, listify, tupleize, argpack, inshape, istransposeable, isreshapeable + + +class BoltArraySpark(BoltArray): + + _metadata = { + '_shape': None, + '_split': None, + '_dtype': None, + '_ordered': True + } + + def __init__(self, rdd, shape=None, split=None, dtype=None, ordered=True): + self._rdd = rdd + self._shape = shape + self._split = split + self._dtype = dtype + self._mode = 'spark' + self._ordered = ordered + + @property + def _constructor(self): + return BoltArraySpark + + def __array__(self): + return self.toarray() + + def cache(self): + """ + Cache the underlying RDD in memory. + """ + self._rdd.cache() + + def unpersist(self): + """ + Remove the underlying RDD from memory. 
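+
+        A minimal usage sketch (assumes ``import numpy as np``, the ``array``
+        constructor from ``bolt``, and a live SparkContext ``sc``):
+
+            b = array(np.arange(24).reshape(2, 3, 4), sc)
+            b.cache()        # keep the RDD in memory across repeated actions
+            b.sum(axis=0)
+            b.mean(axis=0)
+            b.unpersist()    # release it again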
+ """ + self._rdd.unpersist() + + def repartition(self, npartitions): + """ + Repartitions the underlying RDD + + Parameters + ---------- + npartitions : int + Number of partitions to repartion the underlying RDD to + """ + + rdd = self._rdd.repartition(npartitions) + return self._constructor(rdd, ordered=False).__finalize__(self) + + def stack(self, size=None): + """ + Aggregates records of a distributed array. + + Stacking should improve the performance of vectorized operations, + but the resulting StackedArray object only exposes a restricted set + of operations (e.g. map, reduce). The unstack method can be used + to restore the full bolt array. + + Parameters + ---------- + size : int, optional, default=None + The maximum size for each stack (number of original records), + will aggregate groups of records per partition up to this size, + if None will aggregate all records on each partition. + + Returns + ------- + StackedArray + """ + stk = StackedArray(self._rdd, shape=self.shape, split=self.split) + return stk.stack(size) + + def _align(self, axis): + """ + Align spark bolt array so that axes for iteration are in the keys. + + This operation is applied before most functional operators. + It ensures that the specified axes are valid, and swaps + key/value axes so that functional operators can be applied + over the correct records. + + Parameters + ---------- + axis: tuple[int] + One or more axes that wil be iterated over by a functional operator + + Returns + ------- + BoltArraySpark + """ + # ensure that the specified axes are valid + inshape(self.shape, axis) + + # find the value axes that should be moved into the keys (axis >= split) + tokeys = [(a - self.split) for a in axis if a >= self.split] + + # find the key axes that should be moved into the values (axis < split) + tovalues = [a for a in range(self.split) if a not in axis] + + if tokeys or tovalues: + return self.swap(tovalues, tokeys) + else: + return self + + def first(self): + """ + Return the first element of an array + """ + from bolt.local.array import BoltArrayLocal + rdd = self._rdd if self._ordered else self._rdd.sortByKey() + return BoltArrayLocal(rdd.values().first()) + + def map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False): + """ + Apply a function across an axis. + + Array will be aligned so that the desired set of axes + are in the keys, which may incur a swap. + + Parameters + ---------- + func : function + Function of a single array to apply. If with_keys=True, + function should be of a (tuple, array) pair. + + axis : tuple or int, optional, default=(0,) + Axis or multiple axes to apply function along. 
+ + value_shape : tuple, optional, default=None + Known shape of values resulting from operation + + dtype: numpy.dtype, optional, default=None + Known dtype of values resulting from operation + + with_keys : bool, optional, default=False + Include keys as an argument to the function + + Returns + ------- + BoltArraySpark + """ + axis = tupleize(axis) + swapped = self._align(axis) + + if with_keys: + test_func = lambda x: func(((0,), x)) + else: + test_func = func + + if value_shape is None or dtype is None: + # try to compute the size of each mapped element by applying func to a random array + try: + mapped = test_func(random.randn(*swapped.values.shape).astype(self.dtype)) + except Exception: + first = swapped._rdd.first() + if first: + # eval func on the first element + mapped = test_func(first[1]) + if value_shape is None: + value_shape = mapped.shape + if dtype is None: + dtype = mapped.dtype + + shape = tuple([swapped._shape[ax] for ax in range(len(axis))]) + tupleize(value_shape) + + if with_keys: + rdd = swapped._rdd.map(lambda kv: (kv[0], func(kv))) + else: + rdd = swapped._rdd.mapValues(func) + + # reshaping will fail if the elements aren't uniformly shaped + def check(v): + if len(v.shape) > 0 and v.shape != tupleize(value_shape): + raise Exception("Map operation did not produce values of uniform shape.") + return v + + rdd = rdd.mapValues(lambda v: check(v)) + + return self._constructor(rdd, shape=shape, dtype=dtype, split=swapped.split).__finalize__(swapped) + + def filter(self, func, axis=(0,), sort=False): + """ + Filter array along an axis. + + Applies a function which should evaluate to boolean, + along a single axis or multiple axes. Array will be + aligned so that the desired set of axes are in the keys, + which may incur a swap. + + Parameters + ---------- + func : function + Function to apply, should return boolean + + axis : tuple or int, optional, default=(0,) + Axis or multiple axes to filter along. + + sort: bool, optional, default=False + Whether or not to sort by key before reindexing + + Returns + ------- + BoltArraySpark + """ + axis = tupleize(axis) + + swapped = self._align(axis) + def f(record): + return func(record[1]) + rdd = swapped._rdd.filter(f) + if sort: + rdd = rdd.sortByKey().values() + else: + rdd = rdd.values() + + # count the resulting array in order to reindex (linearize) the keys + count, zipped = zip_with_index(rdd) + if not count: + count = zipped.count() + reindexed = zipped.map(lambda kv: (tupleize(kv[1]), kv[0])) + + # since we can only filter over one axis, the remaining shape is always the following + remaining = list(swapped.shape[len(axis):]) + if count != 0: + shape = tuple([count] + remaining) + else: + shape = (0,) + + return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped) + + def reduce(self, func, axis=(0,), keepdims=False): + """ + Reduce an array along an axis. + + Applies a commutative/associative function of two + arguments cumulatively to all arrays along an axis. + Array will be aligned so that the desired set of axes + are in the keys, which may incur a swap. + + Parameters + ---------- + func : function + Function of two arrays that returns a single array + + axis : tuple or int, optional, default=(0,) + Axis or multiple axes to reduce along. 
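+
+        For example (a sketch), reducing with an elementwise function over the
+        leading axis should agree with the corresponding NumPy reduction:
+
+            from numpy import add
+            b.reduce(add, axis=(0,))   # comparable to b.toarray().sum(axis=0)
+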
+ + Returns + ------- + BoltArraySpark + """ + from numpy import ndarray + + axis = tupleize(axis) + swapped = self._align(axis) + arr = swapped._rdd.values().treeReduce(func, depth=3) + + if keepdims: + for i in axis: + arr = expand_dims(arr, axis=i) + + if not isinstance(arr, ndarray): + # the result of a reduce can also be a scalar + return arr + elif arr.shape == (1,): + # ndarrays with single values in them should be converted into scalars + return arr[0] + + return arr + + def _stat(self, axis=None, func=None, name=None, keepdims=False): + """ + Compute a statistic over an axis. + + Can provide either a function (for use in a reduce) + or a name (for use by a stat counter). + + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + func : function, optional, default=None + Function for reduce, see BoltArraySpark.reduce + + name : str + A named statistic, see StatCounter + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + if axis is None: + axis = list(range(len(self.shape))) + axis = tupleize(axis) + + if func and not name: + return self.reduce(func, axis, keepdims) + + if name and not func: + swapped = self._align(axis) + + def reducer(left, right): + return left.combine(right) + + counter = swapped._rdd.values()\ + .mapPartitions(lambda i: [StatCounter(values=i, stats=name)])\ + .treeReduce(reducer, depth=3) + + arr = getattr(counter, name) + + if keepdims: + for i in axis: + arr = expand_dims(arr, axis=i) + + return arr + + else: + raise ValueError('Must specify either a function or a statistic name.') + + def mean(self, axis=None, keepdims=False): + """ + Return the mean of the array over the given axis. + + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + return self._stat(axis, name='mean', keepdims=keepdims) + + def var(self, axis=None, keepdims=False): + """ + Return the variance of the array over the given axis. + + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + return self._stat(axis, name='variance', keepdims=keepdims) + + def std(self, axis=None, keepdims=False): + """ + Return the standard deviation of the array over the given axis. + + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + return self._stat(axis, name='stdev', keepdims=keepdims) + + def sum(self, axis=None, keepdims=False): + """ + Return the sum of the array over the given axis. + + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + from operator import add + return self._stat(axis, func=add, keepdims=keepdims) + + def max(self, axis=None, keepdims=False): + """ + Return the maximum of the array over the given axis. 
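+
+        A sketch of expected behaviour (the result should match NumPy for the
+        same data):
+
+            b.max(axis=(0,))                   # like b.toarray().max(axis=0)
+            b.max(axis=(0, 1), keepdims=True)
+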
+ + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + from numpy import maximum + return self._stat(axis, func=maximum, keepdims=keepdims) + + def min(self, axis=None, keepdims=False): + """ + Return the minimum of the array over the given axis. + + Parameters + ---------- + axis : tuple or int, optional, default=None + Axis to compute statistic over, if None + will compute over all axes + + keepdims : boolean, optional, default=False + Keep axis remaining after operation with size 1. + """ + from numpy import minimum + return self._stat(axis, func=minimum, keepdims=keepdims) + + def concatenate(self, arry, axis=0): + """ + Join this array with another array. + + Paramters + --------- + arry : ndarray, BoltArrayLocal, or BoltArraySpark + Another array to concatenate with + + axis : int, optional, default=0 + The axis along which arrays will be joined. + + Returns + ------- + BoltArraySpark + """ + if isinstance(arry, ndarray): + from bolt.array.construct import array + arry = array(arry, self._rdd.context, axis=range(0, self.split)) + else: + if not isinstance(arry, BoltArraySpark): + raise ValueError("other must be local array or spark array, got %s" % type(arry)) + + if not all([x == y if not i == axis else True + for i, (x, y) in enumerate(zip(self.shape, arry.shape))]): + raise ValueError("all the input array dimensions except for " + "the concatenation axis must match exactly") + + if not self.split == arry.split: + raise NotImplementedError("two arrays must have the same split ") + + if axis < self.split: + shape = self.keys.shape + + def key_func(key): + key = list(key) + key[axis] += shape[axis] + return tuple(key) + + rdd = self._rdd.union(arry._rdd.map(lambda kv: (key_func(kv[0]), kv[1]))) + + else: + from numpy import concatenate as npconcatenate + shift = axis - self.split + rdd = self._rdd.join(arry._rdd).map(lambda kv: (kv[0], npconcatenate(kv[1], axis=shift))) + + shape = tuple([x + y if i == axis else x + for i, (x, y) in enumerate(zip(self.shape, arry.shape))]) + + return self._constructor(rdd, shape=shape, ordered=False).__finalize__(self) + + def _getbasic(self, index): + """ + Basic indexing (for slices or ints). + """ + key_slices = index[0:self.split] + value_slices = index[self.split:] + + def key_check(key): + def inrange(k, s): + if s.step > 0: + return s.start <= k < s.stop + else: + return s.stop < k <= s.start + def check(k, s): + return inrange(k, s) and mod(k - s.start, s.step) == 0 + out = [check(k, s) for k, s in zip(key, key_slices)] + return all(out) + + def key_func(key): + return tuple([(k - s.start)/s.step for k, s in zip(key, key_slices)]) + + filtered = self._rdd.filter(lambda kv: key_check(kv[0])) + + if self._split == self.ndim: + rdd = filtered.map(lambda kv: (key_func(kv[0]), kv[1])) + else: + # handle use of use slice.stop = -1 for a special case (see utils.slicify) + value_slices = [s if s.stop != -1 else slice(s.start, None, s.step) for s in value_slices] + rdd = filtered.map(lambda kv: (key_func(kv[0]), kv[1][value_slices])) + + shape = tuple([int(ceil((s.stop - s.start) / float(s.step))) for s in index]) + split = self.split + return rdd, shape, split + + def _getadvanced(self, index): + """ + Advanced indexing (for sets, lists, or ndarrays). 
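+
+        A sketch of the behaviour this implements, taken from the indexing
+        semantics exercised by the test suite (``b`` built from a 3-d array ``x``):
+
+            b[[0, 1], [0, 2], [0, 3]].toarray()   # == x[[0, 1], [0, 2], [0, 3]]
+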
+ """ + index = [asarray(i) for i in index] + shape = index[0].shape + if not all([i.shape == shape for i in index]): + raise ValueError("shape mismatch: indexing arrays could not be broadcast " + "together with shapes " + ("%s " * self.ndim) + % tuple([i.shape for i in index])) + + index = tuple([listify(i, d) for (i, d) in zip(index, self.shape)]) + + # build tuples with target indices + key_tuples = list(zip(*index[0:self.split])) + value_tuples = list(zip(*index[self.split:])) + + # build dictionary to look up targets in values + d = {} + for k, g in groupby(zip(value_tuples, key_tuples), lambda x: x[1]): + d[k] = map(lambda x: x[0], list(g)) + + def key_check(key): + return key in key_tuples + + def key_func(key): + return unravel_index(key, shape) + + # filter records based on key targets + filtered = self._rdd.filter(lambda kv: key_check(kv[0])) + + # subselect and flatten records based on value targets (if they exist) + if len(value_tuples) > 0: + flattened = filtered.flatMap(lambda kv: [(kv[0], kv[1][i]) for i in d[kv[0]]]) + else: + flattened = filtered + + # reindex + indexed = flattened.zipWithIndex() + rdd = indexed.map(lambda kkv: (key_func(kkv[1]), kkv[0][1])) + split = len(shape) + + return rdd, shape, split + + def _getmixed(self, index): + """ + Mixed indexing (combines basic and advanced indexes) + + Assumes that only a single advanced index is used, due to the complicated + behavior needed to be compatible with NumPy otherwise. + """ + # find the single advanced index + loc = where([isinstance(i, (tuple, list, ndarray)) for i in index])[0][0] + idx = list(index[loc]) + + if isinstance(idx[0], (tuple, list, ndarray)): + raise ValueError("When mixing basic and advanced indexing, " + "advanced index must be one-dimensional") + + # single advanced index is on a key -- filter and update key + if loc < self.split: + def newkey(key): + newkey = list(key) + newkey[loc] = idx.index(key[loc]) + return tuple(newkey) + rdd = self._rdd.filter(lambda kv: kv[0][loc] in idx).map(lambda kv: (newkey(kv[0]), kv[1])) + # single advanced index is on a value -- use NumPy indexing + else: + slices = [slice(0, None, None) for _ in self.values.shape] + slices[loc - self.split] = idx + rdd = self._rdd.map(lambda kv: (kv[0], kv[1][slices])) + newshape = list(self.shape) + newshape[loc] = len(idx) + barray = self._constructor(rdd, shape=tuple(newshape)).__finalize__(self) + + # apply the rest of the simple indices + new_index = index[:] + new_index[loc] = slice(0, None, None) + barray = barray[tuple(new_index)] + return barray._rdd, barray.shape, barray.split + + def __getitem__(self, index): + """ + Get an item from the array through indexing. + + Supports basic indexing with slices and ints, or advanced + indexing with lists or ndarrays of integers. + Mixing basic and advanced indexing across axes is currently supported + only for a single advanced index amidst multiple basic indices. 
+ + Parameters + ---------- + index : tuple of slices, ints, list, tuple, or ndarrays + One or more index specifications + + Returns + ------- + BoltSparkArray + """ + if isinstance(index, tuple): + index = list(index) + else: + index = [index] + int_locs = where([isinstance(i, int) for i in index])[0] + + if len(index) > self.ndim: + raise ValueError("Too many indices for array") + + if not all([isinstance(i, (slice, int, list, tuple, ndarray)) for i in index]): + raise ValueError("Each index must either be a slice, int, list, set, or ndarray") + + # fill unspecified axes with full slices + if len(index) < self.ndim: + index += tuple([slice(0, None, None) for _ in range(self.ndim - len(index))]) + + # standardize slices and bounds checking + for n, idx in enumerate(index): + size = self.shape[n] + if isinstance(idx, (slice, int)): + slc = slicify(idx, size) + # throw an error if this would lead to an empty dimension in numpy + if slc.step > 0: + minval, maxval = slc.start, slc.stop + else: + minval, maxval = slc.stop, slc.start + if minval > size-1 or maxval < 1 or minval >= maxval: + raise ValueError("Index {} in dimension {} with shape {} would " + "produce an empty dimension".format(idx, n, size)) + index[n] = slc + else: + adjusted = array(idx) + inds = where(adjusted<0) + adjusted[inds] += size + if adjusted.min() < 0 or adjusted.max() > size-1: + raise ValueError("Index {} out of bounds in dimension {} with " + "shape {}".format(idx, n, size)) + index[n] = adjusted + + # select basic or advanced indexing + if all([isinstance(i, slice) for i in index]): + rdd, shape, split = self._getbasic(index) + elif all([isinstance(i, (tuple, list, ndarray)) for i in index]): + rdd, shape, split = self._getadvanced(index) + elif sum([isinstance(i, (tuple, list, ndarray)) for i in index]) == 1: + rdd, shape, split = self._getmixed(index) + else: + raise NotImplementedError("When mixing basic indexing (slices and int) with " + "with advanced indexing (lists, tuples, and ndarrays), " + "can only have a single advanced index") + + # if any key indices used negative steps, records are no longer ordered + if self._ordered is False or any([isinstance(s, slice) and s.step<0 for s in index[:self.split]]): + ordered = False + else: + ordered = True + + result = self._constructor(rdd, shape=shape, split=split, ordered=ordered).__finalize__(self) + + # squeeze out int dimensions (and squeeze to singletons if all ints) + if len(int_locs) == self.ndim: + return result.squeeze().toarray()[()] + else: + return result.squeeze(tuple(int_locs)) + + def chunk(self, size="150", axis=None, padding=None): + """ + Chunks records of a distributed array. + + Chunking breaks arrays into subarrays, using an specified + size of chunks along each value dimension. Can alternatively + specify an average chunk byte size (in kilobytes) and the size of + chunks (as ints) will be computed automatically. + + Parameters + ---------- + size : tuple, int, or str, optional, default = "150" + A string giving the size in kilobytes, or a tuple with the size + of chunks along each dimension. + + axis : int or tuple, optional, default = None + One or more axis to chunk array along, if None + will use all axes, + + padding: tuple or int, default = None + Number of elements per dimension that will overlap with the adjacent chunk. + If a tuple, specifies padding along each chunked dimension; if a int, same + padding will be applied to all chunked dimensions. 
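+
+        A usage sketch (assumes a live SparkContext ``sc``); chunks can be mapped
+        over and then reassembled with ``unchunk``:
+
+            b = array(np.ones((4, 6, 8)), sc, axis=(0,))
+            c = b.chunk(size=(3, 4))              # 2 x 2 chunks per (6, 8) value
+            c.map(lambda v: v * 2).unchunk().toarray()
+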
+ + Returns + ------- + ChunkedArray + """ + if type(size) is not str: + size = tupleize((size)) + axis = tupleize((axis)) + padding = tupleize((padding)) + + from bolt.array.chunk import ChunkedArray + + chnk = ChunkedArray(rdd=self._rdd, shape=self._shape, split=self._split, dtype=self._dtype) + return chnk._chunk(size, axis, padding) + + def swap(self, kaxes, vaxes, size="150"): + """ + Swap axes from keys to values. + + This is the core operation underlying shape manipulation + on the Spark bolt array. It exchanges an arbitrary set of axes + between the keys and the valeus. If either is None, will only + move axes in one direction (from keys to values, or values to keys). + Keys moved to values will be placed immediately after the split; + values moved to keys will be placed immediately before the split. + + Parameters + ---------- + kaxes : tuple + Axes from keys to move to values + + vaxes : tuple + Axes from values to move to keys + + size : tuple or int, optional, default = "150" + Can either provide a string giving the size in kilobytes, + or a tuple with the number of chunks along each + value dimension being moved + + Returns + ------- + BoltArraySpark + """ + kaxes = asarray(tupleize(kaxes), 'int') + vaxes = asarray(tupleize(vaxes), 'int') + if type(size) is not str: + size = tupleize(size) + + if len(kaxes) == self.keys.ndim and len(vaxes) == 0: + raise ValueError('Cannot perform a swap that would ' + 'end up with all data on a single key') + + if len(kaxes) == 0 and len(vaxes) == 0: + return self + + from bolt.array.chunk import ChunkedArray + + chunks = self.chunk(size) + + swapped = chunks.keys_to_values(kaxes).values_to_keys([v+len(kaxes) for v in vaxes]) + barray = swapped.unchunk() + + return barray + + def transpose(self, *axes): + """ + Return an array with the axes transposed. + + This operation will incur a swap unless the + desiured permutation can be obtained + only by transpoing the keys or the values. + + Parameters + ---------- + axes : None, tuple of ints, or n ints + If None, will reverse axis order. + """ + if len(axes) == 0: + p = arange(self.ndim-1, -1, -1) + else: + p = asarray(argpack(axes)) + + istransposeable(p, range(self.ndim)) + + split = self.split + + # compute the keys/value axes that need to be swapped + new_keys, new_values = p[:split], p[split:] + swapping_keys = sort(new_values[new_values < split]) + swapping_values = sort(new_keys[new_keys >= split]) + stationary_keys = sort(new_keys[new_keys < split]) + stationary_values = sort(new_values[new_values >= split]) + + # compute the permutation that the swap causes + p_swap = r_[stationary_keys, swapping_values, swapping_keys, stationary_values] + + # compute the extra permutation (p_x) on top of this that + # needs to happen to get the full permutation desired + p_swap_inv = argsort(p_swap) + p_x = p_swap_inv[p] + p_keys, p_values = p_x[:split], p_x[split:]-split + + # perform the swap and the the within key/value permutations + arr = self.swap(swapping_keys, swapping_values-split) + arr = arr.keys.transpose(tuple(p_keys.tolist())) + arr = arr.values.transpose(tuple(p_values.tolist())) + + return arr + + @property + def T(self): + """ + Transpose by reversing the order of the axes. + """ + return self.transpose() + + def swapaxes(self, axis1, axis2): + """ + Return the array with two axes interchanged. 
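+
+        The result should agree with NumPy (a sketch):
+
+            b.swapaxes(1, 2).toarray()   # == b.toarray().swapaxes(1, 2)
+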
+ + Parameters + ---------- + axis1 : int + The first axis to swap + + axis2 : int + The second axis to swap + """ + p = list(range(self.ndim)) + p[axis1] = axis2 + p[axis2] = axis1 + + return self.transpose(p) + + def reshape(self, *shape): + """ + Return an array with the same data but a new shape. + + Currently only supports reshaping that independently + reshapes the keys, or the values, or both. + + Parameters + ---------- + shape : tuple of ints, or n ints + New shape + """ + new = argpack(shape) + isreshapeable(new, self.shape) + + if new == self.shape: + return self + + i = self._reshapebasic(new) + if i == -1: + raise NotImplementedError("Currently no support for reshaping between " + "keys and values for BoltArraySpark") + else: + new_key_shape, new_value_shape = new[:i], new[i:] + return self.keys.reshape(new_key_shape).values.reshape(new_value_shape) + + def _reshapebasic(self, shape): + """ + Check if the requested reshape can be broken into independant reshapes + on the keys and values. If it can, returns the index in the new shape + separating keys from values, otherwise returns -1 + """ + new = tupleize(shape) + old_key_size = prod(self.keys.shape) + old_value_size = prod(self.values.shape) + + for i in range(len(new)): + new_key_size = prod(new[:i]) + new_value_size = prod(new[i:]) + if new_key_size == old_key_size and new_value_size == old_value_size: + return i + + return -1 + + def squeeze(self, axis=None): + """ + Remove one or more single-dimensional axes from the array. + + Parameters + ---------- + axis : tuple or int + One or more singleton axes to remove. + """ + if not any([d == 1 for d in self.shape]): + return self + + if axis is None: + drop = where(asarray(self.shape) == 1)[0] + elif isinstance(axis, int): + drop = asarray((axis,)) + elif isinstance(axis, tuple): + drop = asarray(axis) + else: + raise ValueError("an integer or tuple is required for the axis") + + if any([self.shape[i] > 1 for i in drop]): + raise ValueError("cannot select an axis to squeeze out which has size greater than one") + + if any(asarray(drop) < self.split): + kmask = set([d for d in drop if d < self.split]) + kfunc = lambda k: tuple([kk for ii, kk in enumerate(k) if ii not in kmask]) + else: + kfunc = lambda k: k + + if any(asarray(drop) >= self.split): + vmask = tuple([d - self.split for d in drop if d >= self.split]) + vfunc = lambda v: v.squeeze(vmask) + else: + vfunc = lambda v: v + + rdd = self._rdd.map(lambda kv: (kfunc(kv[0]), vfunc(kv[1]))) + shape = tuple([ss for ii, ss in enumerate(self.shape) if ii not in drop]) + split = len([d for d in range(self.keys.ndim) if d not in drop]) + return self._constructor(rdd, shape=shape, split=split).__finalize__(self) + + def astype(self, dtype, casting='unsafe'): + """ + Cast the array to a specified type. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to cast the array to (see numpy) + """ + rdd = self._rdd.mapValues(lambda v: v.astype(dtype, 'K', casting)) + return self._constructor(rdd, dtype=dtype).__finalize__(self) + + def clip(self, min=None, max=None): + """ + Clip values above and below. + + Parameters + ---------- + min : scalar or array-like + Minimum value. If array, will be broadcasted + + max : scalar or array-like + Maximum value. If array, will be broadcasted. + """ + rdd = self._rdd.mapValues(lambda v: v.clip(min=min, max=max)) + return self._constructor(rdd).__finalize__(self) + + @property + def shape(self): + """ + Size of each dimension. 
+ """ + return self._shape + + @property + def size(self): + """ + Total number of elements. + """ + return prod(self._shape) + + @property + def ndim(self): + """ + Number of dimensions. + """ + return len(self._shape) + + @property + def split(self): + """ + Axis at which the array is split into keys/values. + """ + return self._split + + @property + def dtype(self): + """ + Data-type of array. + """ + return self._dtype + + @property + def mask(self): + return tuple([1] * len(self.keys.shape) + [0] * len(self.values.shape)) + + @property + def keys(self): + """ + Returns a restricted keys. + """ + from bolt.array.shapes import Keys + return Keys(self) + + @property + def values(self): + from bolt.array.shapes import Values + return Values(self) + + def toarray(self): + """ + Returns the contents as a local array. + + Will likely cause memory problems for large objects. + """ + rdd = self._rdd if self._ordered else self._rdd.sortByKey() + x = rdd.values().collect() + return asarray(x).reshape(self.shape) + + def tordd(self): + """ + Return the underlying RDD of the bolt array. + """ + return self._rdd + + def display(self): + """ + Show a pretty-printed representation of this BoltArrayLocal. + """ + for x in self._rdd.take(10): + print(x) diff --git a/bolt/array/base.py b/bolt/array/base.py new file mode 100644 index 0000000..240d926 --- /dev/null +++ b/bolt/array/base.py @@ -0,0 +1,158 @@ +class BoltArray(object): + + _mode = None + _metadata = {} + + def __finalize__(self, other): + if isinstance(other, BoltArray): + for name in self._metadata: + other_attr = getattr(other, name, None) + if (other_attr is not self._metadata[name]) \ + and (getattr(self, name, None) is self._metadata[name]): + object.__setattr__(self, name, other_attr) + return self + + @property + def mode(self): + return self._mode + + @property + def shape(self): + """ + Size of each dimension. + """ + raise NotImplementedError + + @property + def size(self): + """ + Total number of elements. + """ + raise NotImplementedError + + @property + def ndim(self): + """ + Number of dimensions. + """ + raise NotImplementedError + + @property + def dtype(self): + """ + Data-type of array. + """ + raise NotImplementedError + + @property + def _constructor(self): + return None + + def sum(self, axis): + """ + Return the sum of the array elements over the given axis. + """ + raise NotImplementedError + + def mean(self, axis): + """ + Return the mean of the array elements over the given axis. + """ + raise NotImplementedError + + def var(self, axis): + """ + Return the variance of the array elements over the given axis. + """ + raise NotImplementedError + + def std(self, axis): + """ + Return the standard deviation of the array elements over the given axis. + """ + raise NotImplementedError + + def min(self, axis): + """ + Return the minimum of the array elements over the given axis or axes. + """ + raise NotImplementedError + + def max(self, axis): + """ + Return the maximum of the array elements over the given axis or axes. + """ + raise NotImplementedError + + def concatenate(self, arry, axis): + raise NotImplementedError + + def transpose(self, axis): + """ + Return an array with the axes transposed. + """ + raise NotImplementedError + + @property + def T(self): + """ + Transpose by reversing the order of the axes. + """ + raise NotImplementedError + + def reshape(self, axis): + """ + Return an array with the same data but a new shape. 
+ """ + raise NotImplementedError + + def squeeze(self, axis): + """ + Remove one or more single-dimensional axes from the array. + """ + raise NotImplementedError + + def swapaxes(self, axis1, axis2): + """ + Return an array with two axes interchanged. + """ + raise NotImplementedError + + def astype(self, dtype, casting): + """ + Cast the array to a specified type. + """ + raise NotImplementedError + + def __getitem__(self, index): + raise NotImplementedError + + def map(self, func, axis): + """ + Apply a function across one or more axes. + """ + raise NotImplementedError + + def reduce(self, func, axis, keepdims): + """ + Reduce an array across one or more axes. + """ + raise NotImplementedError + + def filter(self, func, axis): + """ + Filter an array across one or more axes. + """ + raise NotImplementedError + + def first(self): + """ + Return the first element of the array + """ + raise NotImplementedError + + def __repr__(self): + s = "BoltArray\n" + s += "mode: %s\n" % self._mode + s += "shape: %s\n" % str(self.shape) + return s diff --git a/bolt/array/chunk.py b/bolt/array/chunk.py new file mode 100644 index 0000000..9f0f7a9 --- /dev/null +++ b/bolt/array/chunk.py @@ -0,0 +1,677 @@ +from numpy import zeros, ones, asarray, r_, concatenate, arange, ceil, prod, \ + empty, mod, floor, any, ndarray, amin, amax, array_equal, squeeze, array, \ + where, random, ravel_multi_index + +from itertools import product + +from bolt.utils import tuplesort, tupleize, allstack, iterexpand +from bolt.array.array import BoltArraySpark + + +class ChunkedArray(object): + """ + Wraps a BoltArraySpark and provides an interface for chunking + into subarrays and performing operations on chunks. Many methods will + be restricted until the chunked array is unchunked. + + The general form supports axis movement during chunking, specifically, + moving axes from keys to values and vice versa. For every + value-dimension that becomes a key, the values are sliced along that + dimension into 'chunks' of a user-specified size. This is an + intermediate form that can be transformed back into a BoltSparkArray. 
+ """ + _metadata = ['_shape', '_split', '_dtype', '_plan', '_padding', '_ordered'] + + def __init__(self, rdd, shape=None, split=None, dtype=None, plan=None, padding=None, ordered=None): + self._rdd = rdd + self._shape = shape + self._split = split + self._dtype = dtype + self._plan = plan + self._padding = padding + self._ordered = ordered + + @property + def dtype(self): + return self._dtype + + @property + def shape(self): + return self._shape + + @property + def split(self): + return self._split + + @property + def plan(self): + return self._plan + + @property + def padding(self): + return self._padding + + @property + def uniform(self): + return all([mod(x, y) == 0 for x, y in zip(self.vshape, self.plan)]) + + @property + def padded(self): + return not all([p == 0 for p in self.padding]) + + @property + def kshape(self): + return asarray(self._shape[:self._split]) + + @property + def vshape(self): + return asarray(self._shape[self._split:]) + + def kmask(self, axes): + return self.getmask(axes, len(self.kshape)) + + def vmask(self, axes): + return self.getmask(axes, len(self.vshape)) + + @property + def _constructor(self): + return ChunkedArray + + def __finalize__(self, other): + for name in self._metadata: + other_attr = getattr(other, name, None) + if (other_attr is not None) and (getattr(self, name, None) is None): + object.__setattr__(self, name, other_attr) + return self + + def _chunk(self, size="150", axis=None, padding=None): + """ + Split values of distributed array into chunks. + + Transforms an underlying pair RDD of (key, value) into + records of the form: (key, chunk id), (chunked value). + Here, chunk id is a tuple identifying the chunk and + chunked value is a subset of the data from each original value, + that has been divided along the specified dimensions. + + Parameters + ---------- + size : str or tuple or int + If str, the average size (in KB) of the chunks in all value dimensions. + If int or tuple, an explicit specification of the number chunks in + each value dimension. + + axis : tuple, optional, default=None + One or more axes to estimate chunks for, if provided any + other axes will use one chunk. + + padding: tuple or int, default = None + Number of elements per dimension that will overlap with the adjacent chunk. + If a tuple, specifies padding along each chunked dimension; if a int, same + padding will be applied to all chunked dimensions. 
+ """ + if self.split == len(self.shape) and padding is None: + self._rdd = self._rdd.map(lambda kv: (kv[0]+(0,), array(kv[1], ndmin=1))) + self._shape = self._shape + (1,) + self._plan = (1,) + self._padding = array([0]) + return self + + rdd = self._rdd + self._plan, self._padding = self.getplan(size, axis, padding) + + if any([x + y > z for x, y, z in zip(self.plan, self.padding, self.vshape)]): + raise ValueError("Chunk sizes %s plus padding sizes %s cannot exceed value dimensions %s along any axis" + % (tuple(self.plan), tuple(self.padding), tuple(self.vshape))) + + if any([x > y for x, y in zip(self.padding, self.plan)]): + raise ValueError("Padding sizes %s cannot exceed chunk sizes %s along any axis" + % (tuple(self.padding), tuple(self.plan))) + + slices = self.getslices(self.plan, self.padding, self.vshape) + labels = list(product(*[list(enumerate(s)) for s in slices])) + scheme = [list(zip(*s)) for s in labels] + + def _chunk(record): + k, v = record[0], record[1] + for (chk, slc) in scheme: + if type(k) is int: + k = (k,) + yield k + chk, v[slc] + + rdd = rdd.flatMap(_chunk) + return self._constructor(rdd, shape=self.shape, split=self.split, + dtype=self.dtype, plan=self.plan, padding=self.padding, ordered=self._ordered) + + def unchunk(self): + """ + Convert a chunked array back into a full array with (key,value) pairs + where key is a tuple of indices, and value is an ndarray. + """ + plan, padding, vshape, split = self.plan, self.padding, self.vshape, self.split + nchunks = self.getnumber(plan, vshape) + full_shape = concatenate((nchunks, plan)) + n = len(vshape) + perm = concatenate(list(zip(range(n), range(n, 2*n)))) + + if self.uniform: + def _unchunk(it): + ordered = sorted(it, key=lambda kv: kv[0][split:]) + keys, values = zip(*ordered) + yield keys[0][:split], asarray(values).reshape(full_shape).transpose(perm).reshape(vshape) + else: + def _unchunk(it): + ordered = sorted(it, key=lambda kv: kv[0][split:]) + keys, values = zip(*ordered) + k_chks = [k[split:] for k in keys] + arr = empty(nchunks, dtype='object') + for (i, d) in zip(k_chks, values): + arr[i] = d + yield keys[0][:split], allstack(arr.tolist()) + + # remove padding + if self.padded: + removepad = self.removepad + rdd = self._rdd.map(lambda kv: (kv[0], removepad(kv[0][split:], kv[1], nchunks, padding, axes=range(n)))) + else: + rdd = self._rdd + + # skip partitionBy if there is not actually any chunking + if array_equal(self.plan, self.vshape): + rdd = rdd.map(lambda kv: (kv[0][:split], kv[1])) + ordered = self._ordered + else: + ranges = self.kshape + npartitions = int(prod(ranges)) + if len(self.kshape) == 0: + partitioner = lambda k: 0 + else: + partitioner = lambda k: ravel_multi_index(k[:split], ranges) + rdd = rdd.partitionBy(numPartitions=npartitions, partitionFunc=partitioner).mapPartitions(_unchunk) + ordered = True + + if array_equal(self.vshape, [1]): + rdd = rdd.mapValues(lambda v: squeeze(v)) + newshape = self.shape[:-1] + else: + newshape = self.shape + + return BoltArraySpark(rdd, shape=newshape, split=self._split, + dtype=self.dtype, ordered=ordered) + + def keys_to_values(self, axes, size=None): + """ + Move indices in the keys into the values. + + Padding on these new value-dimensions is not currently supported and is set to 0. + + Parameters + ---------- + axes : tuple + Axes from keys to move to values. + + size : tuple, optional, default=None + Size of chunks for the values along the new dimensions. 
+ If None, then no chunking for all axes (number of chunks = 1) + + Returns + ------- + ChunkedArray + """ + if len(axes) == 0: + return self + + kmask = self.kmask(axes) + + if size is None: + size = self.kshape[kmask] + + # update properties + newplan = r_[size, self.plan] + newsplit = self._split - len(axes) + newshape = tuple(r_[self.kshape[~kmask], self.kshape[kmask], self.vshape].astype(int).tolist()) + newpadding = r_[zeros(len(axes), dtype=int), self.padding] + + result = self._constructor(None, shape=newshape, split=newsplit, + dtype=self.dtype, plan=newplan, padding=newpadding, ordered=True) + + # convert keys into chunk + within-chunk label + split = self.split + def _relabel(record): + k, data = record + keys, chks = asarray(k[:split], 'int'), k[split:] + movingkeys, stationarykeys = keys[kmask], keys[~kmask] + newchks = [int(m) for m in movingkeys/size] # element-wise integer division that works in Python 2 and 3 + labels = mod(movingkeys, size) + return tuple(stationarykeys) + tuple(newchks) + tuple(chks) + tuple(labels), data + + rdd = self._rdd.map(_relabel) + + # group the new chunks together + nchunks = result.getnumber(result.plan, result.vshape) + npartitions = int(prod(result.kshape) * prod(nchunks)) + ranges = tuple(result.kshape) + tuple(nchunks) + n = len(axes) + if n == 0: + s = slice(None) + else: + s = slice(-n) + partitioner = lambda k: ravel_multi_index(k[s], ranges) + + rdd = rdd.partitionBy(numPartitions=npartitions, partitionFunc=partitioner) + + # reassemble the pieces in the chunks by sorting and then stacking + uniform = result.uniform + + def _rebuild(it): + ordered = sorted(it, key=lambda kv: kv[0][n:]) + keys, data = zip(*ordered) + + k = keys[0][s] + labels = asarray([x[-n:] for x in keys]) + + if uniform: + labelshape = tuple(size) + else: + labelshape = tuple(amax(labels, axis=0) - amin(labels, axis=0) + 1) + + valshape = data[0].shape + fullshape = labelshape + valshape + yield k, asarray(data).reshape(fullshape) + + result._rdd = rdd.mapPartitions(_rebuild) + + if array_equal(self.vshape, [1]): + result._rdd = result._rdd.mapValues(lambda v: squeeze(v)) + result._shape = result.shape[:-1] + result._plan = result.plan[:-1] + + return result + + def values_to_keys(self, axes): + + vmask = self.vmask(axes) + split = self.split + + # update properties + newplan = self.plan[~vmask] + newsplit = split + len(axes) + newshape = tuple(r_[self.kshape, self.vshape[vmask], self.vshape[~vmask]].astype(int).tolist()) + newpadding = self.padding[~vmask] + + result = self._constructor(None, shape=newshape, split=newsplit, + dtype=self.dtype, plan=newplan, padding=newpadding, ordered=self._ordered) + + # remove padding + if self.padded: + plan, padding = self.plan, self.padding + nchunks = self.getnumber(plan, self.vshape) + removepad = self.removepad + rdd = self._rdd.map(lambda kv: (kv[0], removepad(kv[0][split:], kv[1], nchunks, padding, axes=axes))) + else: + rdd = self._rdd + + # extract new records + slices = [None if vmask[i] else slice(0, self.vshape[i], 1) for i in range(len(vmask))] + slices = asarray(slices) + + movingsizes = self.plan[vmask] + split = self.split + def _extract(record): + + keys, data = record + k, chk = keys[:split], keys[split:] + + movingchks = asarray(chk)[vmask] + newchks = tuple(asarray(chk)[~vmask]) + keyoffsets = prod([movingchks, movingsizes], axis=0) + + bounds = asarray(data.shape)[vmask] + indices = list(product(*map(lambda x: arange(x), bounds))) + + for b in indices: + s = slices.copy() + s[vmask] = b + newdata = 
data[tuple(s)] + newkeys = tuple(r_[k, keyoffsets + b].astype('int')) + yield newkeys + newchks, newdata + + result._rdd = rdd.flatMap(_extract) + + if len(result.vshape) == 0: + result._rdd = result._rdd.mapValues(lambda v: array(v, ndmin=1)) + result._shape = result._shape + (1,) + result._plan = (1,) + result._padding = array([0]) + + return result + + def map(self, func, value_shape=None, dtype=None): + """ + Apply an array -> array function on each subarray. + + The function can change the shape of the subarray, but only along + dimensions that are not chunked. + + Parameters + ---------- + func : function + Function of a single subarray to apply + + value_shape: + Known shape of chunking plan after the map + + dtype: numpy.dtype, optional, default=None + Known dtype of values resulting from operation + + Returns + ------- + ChunkedArray + """ + + if value_shape is None or dtype is None: + # try to compute the size of each mapped element by applying func to a random array + try: + mapped = func(random.randn(*self.plan).astype(self.dtype)) + except Exception: + first = self._rdd.first() + if first: + # eval func on the first element + mapped = func(first[1]) + if value_shape is None: + value_shape = mapped.shape + if dtype is None: + dtype = mapped.dtype + + chunked_dims = where(self.plan != self.vshape)[0] + unchunked_dims = where(self.plan == self.vshape)[0] + + # check that no dimensions are dropped + if len(value_shape) != len(self.plan): + raise NotImplementedError('map on ChunkedArray cannot drop dimensions') + + # check that chunked dimensions did not change shape + if any([value_shape[i] != self.plan[i] for i in chunked_dims]): + raise ValueError('map cannot change the sizes of chunked dimensions') + + def check_and_apply(v): + new = func(v) + if len(unchunked_dims) > 0: + if any([new.shape[i] != value_shape[i] for i in unchunked_dims]): + raise Exception("Map operation did not produce values of uniform shape.") + if len(chunked_dims) > 0: + if any([v.shape[i] != new.shape[i] for i in chunked_dims]): + raise Exception("Map operation changed the size of a chunked dimension") + return new + + rdd = self._rdd.mapValues(check_and_apply) + + vshape = [value_shape[i] if i in unchunked_dims else self.vshape[i] for i in range(len(self.vshape))] + newshape = r_[self.kshape, vshape].astype(int).tolist() + + return self._constructor(rdd, shape=tuple(newshape), dtype=dtype, + plan=asarray(value_shape)).__finalize__(self) + + def map_generic(self, func): + """ + Apply a generic array -> object to each subarray + + The resulting object is a BoltArraySpark of dtype object where the + blocked dimensions are replaced with indices indication block ID. + """ + def process_record(val): + newval = empty(1, dtype="object") + newval[0] = func(val) + return newval + + rdd = self._rdd.mapValues(process_record) + + nchunks = self.getnumber(self.plan, self.vshape) + newshape = tuple([int(s) for s in r_[self.kshape, nchunks]]) + newsplit = len(self.shape) + return BoltArraySpark(rdd, shape=newshape, split=newsplit, ordered=self._ordered, dtype="object") + + def getplan(self, size="150", axes=None, padding=None): + """ + Identify a plan for chunking values along each dimension. + + Generates an ndarray with the size (in number of elements) of chunks + in each dimension. If provided, will estimate chunks for only a + subset of axes, leaving all others to the full size of the axis. + + Parameters + ---------- + size : string or tuple + If str, the average size (in KB) of the chunks in all value dimensions. 
+ If int/tuple, an explicit specification of the number chunks in + each moving value dimension. + + axes : tuple, optional, default=None + One or more axes to estimate chunks for, if provided any + other axes will use one chunk. + + padding : tuple or int, option, default=None + Size over overlapping padding between chunks in each dimension. + If tuple, specifies padding along each chunked dimension; if int, + all dimensions use same padding; if None, no padding + """ + from numpy import dtype as gettype + + # initialize with all elements in one chunk + plan = self.vshape + + # check for subset of axes + if axes is None: + if isinstance(size, str): + axes = arange(len(self.vshape)) + else: + axes = arange(len(size)) + else: + axes = asarray(axes, 'int') + + # set padding + pad = array(len(self.vshape)*[0, ]) + if padding is not None: + pad[axes] = padding + + # set the plan + if isinstance(size, tuple): + plan[axes] = size + + elif isinstance(size, str): + # convert from kilobytes + size = 1000.0 * float(size) + + # calculate from dtype + elsize = gettype(self.dtype).itemsize + nelements = prod(self.vshape) + dims = self.vshape[self.vmask(axes)] + + if size <= elsize: + s = ones(len(axes)) + + else: + remsize = 1.0 * nelements * elsize + s = [] + for (i, d) in enumerate(dims): + minsize = remsize/d + if minsize >= size: + s.append(1) + remsize = minsize + continue + else: + s.append(min(d, floor(size/minsize))) + s[i+1:] = plan[i+1:] + break + + plan[axes] = s + + else: + raise ValueError("Chunk size not understood, must be tuple or int") + + return plan, pad + + @staticmethod + def removepad(idx, value, number, padding, axes=None): + """ + Remove the padding from chunks. + + Given a chunk and its corresponding index, use the plan and padding to remove any + padding from the chunk along with specified axes. + + Parameters + ---------- + idx: tuple or array-like + The chunk index, indicating which chunk this is. + + value: ndarray + The chunk that goes along with the index. + + number: ndarray or array-like + The number of chunks along each dimension. + + padding: ndarray or array-like + The padding scheme. + + axes: tuple, optional, default = None + The axes (in the values) along which to remove padding. + """ + if axes is None: + axes = range(len(number)) + mask = len(number)*[False, ] + for i in range(len(mask)): + if i in axes and padding[i] != 0: + mask[i] = True + + starts = [0 if (i == 0 or not m) else p for (i, m, p) in zip(idx, mask, padding)] + stops = [None if (i == n-1 or not m) else -p for (i, m, p, n) in zip(idx, mask, padding, number)] + slices = [slice(i1, i2) for (i1, i2) in zip(starts, stops)] + + return value[slices] + + @staticmethod + def getnumber(plan, shape): + """ + Obtain number of chunks for the given dimensions and chunk sizes. + + Given a plan for the number of chunks along each dimension, + calculate the number of chunks that this will lead to. + + Parameters + ---------- + plan: tuple or array-like + Size of chunks (in number of elements) along each dimensions. + Length must be equal to the number of dimensions. + + shape : tuple + Shape of array to be chunked. + """ + nchunks = [] + for size, d in zip(plan, shape): + nchunks.append(int(ceil(1.0 * d/size))) + return nchunks + + @staticmethod + def getslices(plan, padding, shape): + """ + Obtain slices for the given dimensions, padding, and chunks. + + Given a plan for the number of chunks along each dimension and the amount of padding, + calculate a list of slices required to generate those chunks. 
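+
+        For example (a sketch; outputs follow from the logic below), a 10-element
+        axis in chunks of 5:
+
+            ChunkedArray.getslices((5,), (0,), (10,))  # [[slice(0, 5, 1), slice(5, 10, 1)]]
+            ChunkedArray.getslices((5,), (1,), (10,))  # [[slice(0, 6, 1), slice(4, 11, 1)]]
+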
+ + Parameters + ---------- + plan: tuple or array-like + Size of chunks (in number of elements) along each dimensions. + Length must be equal to the number of dimensions. + + padding: tuple or array-like + Size of overlap (in number of elements) between chunks along each dimension. + Length must be equal to the number of dimensions. + + shape: tuple + Dimensions of axes to be chunked. + """ + slices = [] + for size, pad, d in zip(plan, padding, shape): + nchunks = int(floor(d/size)) + remainder = d % size + start = 0 + dimslices = [] + for idx in range(nchunks): + end = start + size + # left endpoint + if idx == 0: + left = start + else: + left = start - pad + # right endpoint + if idx == nchunks: + right = end + else: + right = end + pad + dimslices.append(slice(left, right, 1)) + start = end + if remainder: + dimslices.append(slice(end - pad, d, 1)) + slices.append(dimslices) + return slices + + @staticmethod + def getmask(inds, n): + """ + Obtain a binary mask by setting a subset of entries to true. + + Parameters + ---------- + inds : array-like + Which indices to set as true. + + n : int + The length of the target mask. + """ + inds = asarray(inds, 'int') + mask = zeros(n, dtype=bool) + mask[inds] = True + return mask + + def tordd(self): + """ + Return the RDD wrapped by the ChunkedArray. + + Returns + ------- + RDD + """ + return self._rdd + + def cache(self): + """ + Cache the underlying RDD in memory. + """ + self._rdd.cache() + + def unpersist(self): + """ + Remove the underlying RDD from memory. + """ + self._rdd.unpersist() + + def __str__(self): + s = "Chunked BoltArray\n" + s += "shape: %s\n" % str(self.shape) + return s + + def __repr__(self): + string = str(self) + if array_equal(self.vshape, [1]): + newlines = [i for (i, char) in enumerate(string) if char=='\n'] + string = string[:newlines[-2]+1] + string += "shape: %s\n" % str(self.shape[:-1]) + string += "chunk size: %s\n" % str(tuple(self.plan)) + if self.padded: + string += "padding: %s\n" % str(tuple(self.padding)) + else: + string += "padding: none\n" + + return string diff --git a/bolt/array/construct.py b/bolt/array/construct.py new file mode 100644 index 0000000..429ecc9 --- /dev/null +++ b/bolt/array/construct.py @@ -0,0 +1,212 @@ +from numpy import unravel_index, prod, arange, asarray, float64 + +from itertools import product + +from bolt.array.array import BoltArraySpark +from bolt.array.utils import get_kv_shape, get_kv_axes + + +def array(a, context=None, axis=(0,), dtype=None, npartitions=None): + """ + Create a spark bolt array from a local array. + + Parameters + ---------- + a : array-like + An array, any object exposing the array interface, an + object whose __array__ method returns an array, or any + (nested) sequence. + + context : SparkContext + A context running Spark. (see pyspark) + + axis : tuple, optional, default=(0,) + Which axes to distribute the array along. The resulting + distributed object will use keys to represent these axes, + with the remaining axes represented by values. + + dtype : data-type, optional, default=None + The desired data-type for the array. If None, will + be determined from the data. (see numpy) + + npartitions : int + Number of partitions for parallization. 
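A short sketch of the boolean-mask helper (getmask) defined above, with hypothetical inputs chosen only for illustration:

from numpy import asarray, zeros

inds, n = [0, 2], 4
mask = zeros(n, dtype=bool)
mask[asarray(inds, 'int')] = True
# mask -> [ True, False,  True, False]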
+ + Returns + ------- + BoltArraySpark + """ + if dtype is None: + arry = asarray(a) + dtype = arry.dtype + else: + arry = asarray(a, dtype) + shape = arry.shape + ndim = len(shape) + + # handle the axes specification and transpose if necessary + axes = _format_axes(axis, arry.shape) + key_axes, value_axes = get_kv_axes(arry.shape, axes) + permutation = key_axes + value_axes + arry = arry.transpose(*permutation) + split = len(axes) + + if split < 1: + raise ValueError("split axis must be greater than 0, got %g" % split) + if split > len(shape): + raise ValueError("split axis must not exceed number of axes %g, got %g" % (ndim, split)) + + key_shape = shape[:split] + val_shape = shape[split:] + + keys = zip(*unravel_index(arange(0, int(prod(key_shape))), key_shape)) + vals = arry.reshape((prod(key_shape),) + val_shape) + + rdd = context.parallelize(zip(keys, vals), npartitions) + return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype) + +def ones(shape, context=None, axis=(0,), dtype=float64, npartitions=None): + """ + Create a spark bolt array of ones. + + Parameters + ---------- + shape : tuple + The desired shape of the array. + + context : SparkContext + A context running Spark. (see pyspark) + + axis : tuple, optional, default=(0,) + Which axes to distribute the array along. The resulting + distributed object will use keys to represent these axes, + with the remaining axes represented by values. + + dtype : data-type, optional, default=float64 + The desired data-type for the array. If None, will + be determined from the data. (see numpy) + + npartitions : int + Number of partitions for parallization. + + Returns + ------- + BoltArraySpark + """ + from numpy import ones + return _wrap(ones, shape, context, axis, dtype, npartitions) + +def zeros(shape, context=None, axis=(0,), dtype=float64, npartitions=None): + """ + Create a spark bolt array of zeros. + + Parameters + ---------- + shape : tuple + The desired shape of the array. + + context : SparkContext + A context running Spark. (see pyspark) + + axis : tuple, optional, default=(0,) + Which axes to distribute the array along. The resulting + distributed object will use keys to represent these axes, + with the remaining axes represented by values. + + dtype : data-type, optional, default=float64 + The desired data-type for the array. If None, will + be determined from the data. (see numpy) + + npartitions : int + Number of partitions for parallization. + + Returns + ------- + BoltArraySpark + """ + from numpy import zeros + return _wrap(zeros, shape, context, axis, dtype, npartitions) + +def concatenate(arrays, axis=0): + """ + Join two bolt arrays together, at least one of which is in spark. + + Parameters + ---------- + arrays : tuple + A pair of arrays. At least one must be a spark array, + the other can be a local bolt array, a local numpy array, + or an array-like. + + axis : int, optional, default=0 + The axis along which the arrays will be joined. 
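A minimal usage sketch of the constructors above, assuming `sc` is a running pyspark SparkContext; the shapes mirror cases exercised by the tests added later in this patch:

from numpy import arange
from bolt import array, ones, zeros

x = arange(2 * 3 * 4).reshape((2, 3, 4))
b = array(x, sc, axis=(0, 1))        # distribute the first two axes as keys
assert b.shape == x.shape and b.split == 2

o = ones((2, 3, 4), sc)              # axis=(0,) by default
z = zeros(5, sc, npartitions=2)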
+ + Returns + ------- + BoltArraySpark + """ + if not isinstance(arrays, tuple): + raise ValueError("data type not understood") + if not len(arrays) == 2: + raise NotImplementedError("spark concatenation only supports two arrays") + + first, second = arrays + if isinstance(first, BoltArraySpark): + return first.concatenate(second, axis) + elif isinstance(second, BoltArraySpark): + first = array(first, second._rdd.context) + return first.concatenate(second, axis) + else: + raise ValueError("at least one array must be a spark bolt array") + +def _argcheck(*args, **kwargs): + """ + Check that arguments are consistent with spark array construction. + + Conditions are: + (1) a positional argument is a SparkContext + (2) keyword arg 'context' is a SparkContext + (3) an argument is a BoltArraySpark, or + (4) an argument is a nested list containing a BoltArraySpark + """ + try: + from pyspark import SparkContext + except ImportError: + return False + + cond1 = any([isinstance(arg, SparkContext) for arg in args]) + cond2 = isinstance(kwargs.get('context', None), SparkContext) + cond3 = any([isinstance(arg, BoltArraySpark) for arg in args]) + cond4 = any([any([isinstance(sub, BoltArraySpark) for sub in arg]) + if isinstance(arg, (tuple, list)) else False for arg in args]) + return cond1 or cond2 or cond3 or cond4 + +def _format_axes(axes, shape): + """ + Format target axes given an array shape + """ + if isinstance(axes, int): + axes = (axes,) + elif isinstance(axes, list) or hasattr(axes, '__iter__'): + axes = tuple(axes) + if not isinstance(axes, tuple): + raise ValueError("axes argument %s in the constructor not specified correctly" % str(axes)) + if min(axes) < 0 or max(axes) > len(shape) - 1: + raise ValueError("invalid key axes %s given shape %s" % (str(axes), str(shape))) + return axes + +def _wrap(func, shape, context=None, axis=(0,), dtype=None, npartitions=None): + """ + Wrap an existing numpy constructor in a parallelized construction + """ + if isinstance(shape, int): + shape = (shape,) + key_shape, value_shape = get_kv_shape(shape, _format_axes(axis, shape)) + split = len(key_shape) + + # make the keys + rdd = context.parallelize(list(product(*[arange(x) for x in key_shape])), npartitions) + + # use a map to make the arrays in parallel + rdd = rdd.map(lambda x: (x, func(value_shape, dtype, order='C'))) + return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype) diff --git a/bolt/array/shapes.py b/bolt/array/shapes.py new file mode 100644 index 0000000..5bd92e4 --- /dev/null +++ b/bolt/array/shapes.py @@ -0,0 +1,167 @@ +from numpy import unravel_index, ravel_multi_index + +from bolt.utils import argpack, istransposeable, isreshapeable +from bolt.array.array import BoltArraySpark + + +class Shapes(object): + """ + Base Shape class. These classes wrap a BoltArraySpark in their + entirity, but implement the following attributes and methods as if + they were only working on the keys or the values, depending which + subclass is used. + """ + @property + def shape(self): + raise NotImplementedError + + @property + def ndim(self): + return len(self.shape) + + def reshape(self): + raise NotImplementedError + + def transpose(self): + raise NotImplementedError + +class Keys(Shapes): + """ + This class implements all the base shape attributes and methods + for the keys of a BoltArraySpark. 
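A usage sketch of the module-level concatenate above, assuming `sc` is a running pyspark SparkContext; both operands are distributed here, matching the test cases later in this patch:

from numpy import arange, concatenate as npconcatenate
from bolt import array, concatenate

x = arange(2 * 3 * 4).reshape((2, 3, 4))
b = array(x, sc, axis=0)

bb = concatenate((b, b), axis=0)     # join along a key axis
assert bb.toarray().shape == npconcatenate((x, x), axis=0).shape

bb = concatenate((b, b), axis=2)     # joining along a value axis also works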
+ """ + def __init__(self, barray): + self._barray = barray + + @property + def shape(self): + return self._barray.shape[:self._barray.split] + + def reshape(self, *shape): + """ + Reshape just the keys of a BoltArraySpark, returning a + new BoltArraySpark. + + Parameters + ---------- + shape : tuple + New proposed axes. + """ + new = argpack(shape) + old = self.shape + isreshapeable(new, old) + + if new == old: + return self._barray + + def f(k): + return unravel_index(ravel_multi_index(k, old), new) + + newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1])) + newsplit = len(new) + newshape = new + self._barray.values.shape + + return BoltArraySpark(newrdd, shape=newshape, split=newsplit).__finalize__(self._barray) + + def transpose(self, *axes): + """ + Transpose just the keys of a BoltArraySpark, returning a + new BoltArraySpark. + + Parameters + ---------- + axes : tuple + New proposed axes. + """ + new = argpack(axes) + old = range(self.ndim) + istransposeable(new, old) + + if new == old: + return self._barray + + def f(k): + return tuple(k[i] for i in new) + + newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1])) + newshape = tuple(self.shape[i] for i in new) + self._barray.values.shape + + return BoltArraySpark(newrdd, shape=newshape, ordered=False).__finalize__(self._barray) + + def __str__(self): + s = "BoltArray Keys\n" + s += "shape: %s" % str(self.shape) + return s + + def __repr__(self): + return str(self) + +class Values(Shapes): + """ + This class implements all the base shape attributes and methods + for the values of a BoltArraySpark. + """ + def __init__(self, barray): + self._barray = barray + + @property + def shape(self): + return self._barray.shape[self._barray.split:] + + def reshape(self, *shape): + """ + Reshape just the values of a BoltArraySpark, returning a + new BoltArraySpark. + + Parameters + ---------- + shape : tuple + New proposed axes. + """ + new = argpack(shape) + old = self.shape + isreshapeable(new, old) + + if new == old: + return self._barray + + def f(v): + return v.reshape(new) + + newrdd = self._barray._rdd.mapValues(f) + newshape = self._barray.keys.shape + new + + return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray) + + def transpose(self, *axes): + """ + Transpose just the values of a BoltArraySpark, returning a + new BoltArraySpark. + + Parameters + ---------- + axes : tuple + New proposed axes. + """ + new = argpack(axes) + old = range(self.ndim) + istransposeable(new, old) + + if new == old: + return self._barray + + def f(v): + return v.transpose(new) + + newrdd = self._barray._rdd.mapValues(f) + newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new) + + return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray) + + def __str__(self): + s = "BoltArray Values\n" + s += "shape: %s" % str(self.shape) + return s + + def __repr__(self): + return str(self) diff --git a/bolt/array/stack.py b/bolt/array/stack.py new file mode 100644 index 0000000..6071389 --- /dev/null +++ b/bolt/array/stack.py @@ -0,0 +1,154 @@ +from numpy import asarray, ndarray, concatenate +from bolt.array.utils import zip_with_index + +class StackedArray(object): + """ + Wraps a BoltArraySpark and provides an interface for performing + stacked operations (operations on aggregated subarrays). Many methods + will be restricted or forbidden until the Stacked object is + unstacked. Currently, only map() is implemented. 
The rationale + is that many operations will work faster when vectorized over a + slightly larger array. + + The implementation uses an intermediate RDD that collects all + records on a given partition into 'stacked' (key, value) records. + Here, a key is a 'size' long tuple of original record keys, + and and values is a an array of the corresponding values, + concatenated along a new 0th dimenion. + """ + _metadata = ['_rdd', '_shape', '_split', '_rekeyed'] + + def __init__(self, rdd, shape=None, split=None, rekeyed=False): + self._rdd = rdd + self._shape = shape + self._split = split + self._rekeyed = rekeyed + + def __finalize__(self, other): + for name in self._metadata: + other_attr = getattr(other, name, None) + if (other_attr is not None) and (getattr(self, name, None) is None): + object.__setattr__(self, name, other_attr) + return self + + @property + def shape(self): + return self._shape + + @property + def split(self): + return self._split + + @property + def rekey(self): + return self._rekeyed + + @property + def _constructor(self): + return StackedArray + + def stack(self, size): + """ + Make an intermediate RDD where all records are combined into a + list of keys and larger ndarray along a new 0th dimension. + """ + def tostacks(partition): + keys = [] + arrs = [] + for key, arr in partition: + keys.append(key) + arrs.append(arr) + if size and 0 <= size <= len(keys): + yield (keys, asarray(arrs)) + keys, arrs = [], [] + if keys: + yield (keys, asarray(arrs)) + + rdd = self._rdd.mapPartitions(tostacks) + return self._constructor(rdd).__finalize__(self) + + def unstack(self): + """ + Unstack array and return a new BoltArraySpark via flatMap(). + """ + from bolt.array.array import BoltArraySpark + + if self._rekeyed: + rdd = self._rdd + else: + rdd = self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1]))) + + return BoltArraySpark(rdd, shape=self.shape, split=self.split) + + def map(self, func): + """ + Apply a function on each subarray. + + Parameters + ---------- + func : function + This is applied to each value in the intermediate RDD. + + Returns + ------- + StackedArray + """ + vshape = self.shape[self.split:] + x = self._rdd.values().first() + if x.shape == vshape: + a, b = asarray([x]), asarray([x, x]) + else: + a, b = x, concatenate((x, x)) + + try: + atest = func(a) + btest = func(b) + except Exception as e: + raise RuntimeError("Error evaluating function on test array, got error:\n %s" % e) + + if not (isinstance(atest, ndarray) and isinstance(btest, ndarray)): + raise ValueError("Function must return ndarray") + + # different shapes map to the same new shape + elif atest.shape == btest.shape: + if self._rekeyed is True: + # we've already rekeyed + rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1]))) + shape = (self.shape[0],) + atest.shape + else: + # do the rekeying + count, rdd = zip_with_index(self._rdd.values()) + rdd = rdd.map(lambda kv: ((kv[1],), func(kv[0]))) + shape = (count,) + atest.shape + split = 1 + rekeyed = True + + # different shapes stay different (along the first dimension) + elif atest.shape[0] == a.shape[0] and btest.shape[0] == b.shape[0]: + shape = self.shape[0:self.split] + atest.shape[1:] + split = self.split + rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1]))) + rekeyed = self._rekeyed + + else: + raise ValueError("Cannot infer effect of function on shape") + + return self._constructor(rdd, rekeyed=rekeyed, shape=shape, split=split).__finalize__(self) + + def tordd(self): + """ + Return the RDD wrapped by the StackedArray. 
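A usage sketch of stacked operations as implemented above, assuming `sc` is a running pyspark SparkContext; the shape-inference behavior matches the stacking tests later in this patch:

from bolt import ones

a = ones((100, 2), sc)
s = a.stack(size=5)                                  # group records into subarrays of up to 5 rows

doubled = s.map(lambda v: v * 2).unstack()           # keys preserved -> shape (100, 2)
summed = s.map(lambda v: v.sum(axis=1)).unstack()    # one value per original key -> shape (100,)
assert doubled.shape == (100, 2) and summed.shape == (100,)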
+ + Returns + ------- + RDD + """ + return self._rdd + + def __str__(self): + s = "Stacked BoltArray\n" + s += "shape: %s\n" % str(self.shape) + return s + + def __repr__(self): + return str(self) diff --git a/bolt/array/statcounter.py b/bolt/array/statcounter.py new file mode 100644 index 0000000..162f6eb --- /dev/null +++ b/bolt/array/statcounter.py @@ -0,0 +1,130 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is ported from spark/util/StatCounter.scala +# +# This code is based on pyspark's statcounter.py and used under the ASF 2.0 license. + +import copy +from itertools import chain + +from numpy import sqrt + + +class StatCounter(object): + + REQUIRED_FOR = { + 'mean': ('mu',), + 'sum': ('mu',), + 'variance': ('mu', 'm2'), + 'stdev': ('mu', 'm2'), + 'all': ('mu', 'm2') + } + + def __init__(self, values=(), stats='all'): + self.n = 0 + self.mu = 0.0 + self.m2 = 0.0 + + if isinstance(stats, str): + stats = [stats] + self.required = frozenset(chain().from_iterable([StatCounter.REQUIRED_FOR[stat] for stat in stats])) + + for v in values: + self.merge(v) + + # add a value into this StatCounter, updating the statistics + def merge(self, value): + self.n += 1 + if self.__requires('mu'): + delta = value - self.mu + self.mu += delta / self.n + if self.__requires('m2'): + self.m2 += delta * (value - self.mu) + + return self + + # checks whether the passed attribute name is required to be updated in order to support the + # statistics requested in self.requested + def __requires(self, attrname): + return attrname in self.required + + # merge another StatCounter into this one, adding up the statistics + def combine(self, other): + if not isinstance(other, StatCounter): + raise Exception("can only merge StatCounters!") + + # reference equality holds + if other is self: + # avoid overwriting fields in a weird order + self.merge(copy.deepcopy(other)) + else: + # accumulator should only be updated if it's valid in both statcounters + self.required = set(self.required).intersection(set(other.required)) + + if self.n == 0: + self.n = other.n + for attrname in ('mu', 'm2'): + if self.__requires(attrname): + setattr(self, attrname, getattr(other, attrname)) + + elif other.n != 0: + if self.__requires('mu'): + delta = other.mu - self.mu + if other.n * 10 < self.n: + self.mu = self.mu + (delta * other.n) / (self.n + other.n) + elif self.n * 10 < other.n: + self.mu = other.mu - (delta * self.n) / (self.n + other.n) + else: + self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n) + + if self.__requires('m2'): + self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n) + + self.n += other.n + return self + + def count(self): + return self.n + + def __isavail(self, attrname): + if not all(attr in self.required for attr in 
StatCounter.REQUIRED_FOR[attrname]): + raise ValueError("'%s' stat not available, must be requested at " + "StatCounter instantiation" % attrname) + + @property + def mean(self): + self.__isavail('mean') + return self.mu + + @property + def sum(self): + self.__isavail('sum') + return self.n * self.mu + + @property + def variance(self): + self.__isavail('variance') + if self.n == 0: + return float('nan') + else: + return self.m2 / self.n + + @property + def stdev(self): + self.__isavail('stdev') + return sqrt(self.variance) diff --git a/bolt/array/utils.py b/bolt/array/utils.py new file mode 100644 index 0000000..c006dd2 --- /dev/null +++ b/bolt/array/utils.py @@ -0,0 +1,31 @@ +def get_kv_shape(shape, key_axes): + func = lambda axis: shape[axis] + return _get_kv_func(func, shape, key_axes) + +def get_kv_axes(shape, key_axes): + func = lambda axis: axis + return _get_kv_func(func, shape, key_axes) + +def _get_kv_func(func, shape, key_axes): + key_res = [func(axis) for axis in key_axes] + value_res = [func(axis) for axis in range(len(shape)) if axis not in key_axes] + return key_res, value_res + +def zip_with_index(rdd): + """ + Alternate version of Spark's zipWithIndex that eagerly returns count. + """ + starts = [0] + if rdd.getNumPartitions() > 1: + nums = rdd.mapPartitions(lambda it: [sum(1 for _ in it)]).collect() + count = sum(nums) + for i in range(len(nums) - 1): + starts.append(starts[-1] + nums[i]) + else: + count = rdd.count() + + def func(k, it): + for i, v in enumerate(it, starts[k]): + yield v, i + + return count, rdd.mapPartitionsWithIndex(func) diff --git a/test/test_spark_basic.py b/test/test_spark_basic.py new file mode 100644 index 0000000..f84653a --- /dev/null +++ b/test/test_spark_basic.py @@ -0,0 +1,160 @@ +from numpy import arange, dtype, int64, float64 +from bolt import array, ones +from bolt.utils import allclose + +def test_shape(sc): + + x = arange(2*3).reshape((2, 3)) + b = array(x, sc) + assert b.shape == x.shape + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc) + assert b.shape == x.shape + +def test_size(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc, axis=0) + assert b.size == x.size + +def test_split(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc, axis=0) + assert b.split == 1 + + b = array(x, sc, axis=(0, 1)) + assert b.split == 2 + +def test_ndim(sc): + + x = arange(2**5).reshape(2, 2, 2, 2, 2) + b = array(x, sc, axis=(0, 1, 2)) + + assert b.keys.ndim == 3 + assert b.values.ndim == 2 + assert b.ndim == 5 + +def test_mask(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc, axis=0) + assert b.mask == (1, 0, 0) + + b = array(x, sc, axis=(0, 1)) + assert b.mask == (1, 1, 0) + + b = array(x, sc, axis=(0, 1, 2)) + assert b.mask == (1, 1, 1) + +def test_cache(sc): + + x = arange(2*3).reshape((2, 3)) + b = array(x, sc) + b.cache() + assert b._rdd.is_cached + b.unpersist() + assert not b._rdd.is_cached + +def test_repartition(sc): + x = arange(2 * 3).reshape((2, 3)) + b = array(x, sc) + assert b._ordered + b = b.repartition(10) + assert not b._ordered + assert b._rdd.getNumPartitions() == 10 + +def test_concatenate(sc): + + from numpy import concatenate + from numpy import array as npArray + x = arange(2*3).reshape((2, 3)) + b = array(x, sc) + c = npArray(x) + assert allclose(b.concatenate(x).toarray(), concatenate((x, x))) + assert allclose(b.concatenate(b).toarray(), concatenate((x, x))) + assert allclose(b.concatenate(c).toarray(), concatenate((x, x))) + +def test_dtype(sc): + + a = arange(2**8, 
dtype=int64) + b = array(a, sc, dtype=int64) + assert a.dtype == b.dtype + assert b.dtype == dtype(int64) + dtypes = b._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(int64) + + a = arange(2.0**8) + b = array(a, sc) + assert a.dtype == b.dtype + assert b.dtype == dtype(float64) + dtypes = b._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(float64) + + a = arange(2**8) + b = array(a, sc) + assert a.dtype == b.dtype + assert b.dtype == dtype(int64) + dtypes = b._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(int64) + + from numpy import ones as npones + a = npones(2**8, dtype=bool) + b = array(a, sc) + assert a.dtype == b.dtype + assert b.dtype == dtype(bool) + dtypes = b._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(bool) + + b = ones(2**8, sc) + assert b.dtype == dtype(float64) + dtypes = b._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(float64) + + b = ones(2**8, sc, dtype=bool) + assert b.dtype == dtype(bool) + dtypes = b._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(bool) + +def test_astype(sc): + + from numpy import ones as npones + + a = npones(2**8, dtype=int64) + b = array(a, sc, dtype=int64) + c = b.astype(bool) + assert c.dtype == dtype(bool) + dtypes = c._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(bool) + + b = ones((100, 100), sc, dtype=int64) + c = b.astype(bool) + assert c.dtype == dtype(bool) + dtypes = c._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(bool) + + b = ones((100, 100), sc) + c = b.astype(bool) + assert c.dtype == dtype(bool) + dtypes = c._rdd.map(lambda x: x[1].dtype).collect() + for dt in dtypes: + assert dt == dtype(bool) + +def test_clip(sc): + + from numpy import arange + + a = arange(4).reshape(2, 2) + b = array(a, sc) + assert allclose(b.clip(0).toarray(), a.clip(0)) + assert allclose(b.clip(2).toarray(), a.clip(2)) + assert allclose(b.clip(1, 2).toarray(), a.clip(1, 2)) diff --git a/test/test_spark_chunking.py b/test/test_spark_chunking.py new file mode 100644 index 0000000..277fa2e --- /dev/null +++ b/test/test_spark_chunking.py @@ -0,0 +1,208 @@ +import pytest +from numpy import arange, split, array_equal, empty, newaxis, asarray +from bolt import array, ones +from bolt.utils import allclose + +def test_chunk(sc): + + x = arange(4*6).reshape(1, 4, 6) + b = array(x, sc) + + k1, v1 = zip(*b.chunk((2,3))._rdd.sortByKey().collect()) + k2 = ((0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1)) + v2 = [s for m in split(x[0], (2,), axis=0) for s in split(m, (3,), axis=1)] + assert k1 == k2 + assert all([allclose(m1, m2) for (m1, m2) in zip(v1, v2)]) + + k1, v1 = zip(*b.chunk((3,4))._rdd.sortByKey().collect()) + k2 = ((0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1)) + v2 = [s for m in split(x[0], (3,), axis=0) for s in split(m, (4,), axis=1)] + assert k1 == k2 + assert all([allclose(m1, m2) for (m1, m2) in zip(v1, v2)]) + +def test_unchunk(sc): + + x = arange(4*6).reshape(1, 4, 6) + b = array(x, sc) + + assert allclose(b.chunk((2, 3)).unchunk().toarray(), b.toarray()) + assert allclose(b.chunk((3, 4)).unchunk().toarray(), b.toarray()) + assert allclose(b.chunk((4, 6)).unchunk().toarray(), b.toarray()) + assert allclose(b.chunk('0.1').unchunk().toarray(), b.toarray()) + assert allclose(b.chunk().unchunk().toarray(), b.toarray()) + + x = arange(4*5*10).reshape(1, 4, 5, 10) + b = array(x, sc) + + assert 
allclose(b.chunk((4, 5, 10)).unchunk().toarray(), b.toarray()) + assert allclose(b.chunk((1, 1, 1)).unchunk().toarray(), b.toarray()) + assert allclose(b.chunk((3, 3, 3)).unchunk().toarray(), b.toarray()) + assert allclose(b.chunk((3, 3, 3)).unchunk().toarray(), b.toarray()) + + x = arange(4*6).reshape(4, 6) + b = array(x, sc, (0, 1)) + + assert allclose(b.chunk(()).unchunk().toarray(), b.toarray()) + + b = array(x, sc, (0,)) + + assert allclose(b.chunk((2)).unchunk().toarray(), b.toarray()) + +def test_keys_to_values(sc): + + x = arange(4*7*9*6).reshape(4, 7, 9, 6) + b = array(x, sc, (0, 1)) + c = b.chunk((4, 2)) + + assert allclose(x, c.keys_to_values((0,)).unchunk().toarray().transpose(1, 0, 2, 3)) + assert allclose(x, c.keys_to_values((1,)).unchunk().toarray()) + assert allclose(x, c.keys_to_values((1,), size=(3,)).unchunk().toarray()) + assert allclose(x, c.keys_to_values((0, 1)).unchunk().toarray()) + assert allclose(x, c.keys_to_values((0, 1), size=(2, 3)).unchunk().toarray()) + assert allclose(x, c.keys_to_values(()).unchunk().toarray()) + + b = array(x, sc, range(4)) + c = b.chunk(()) + + assert allclose(x, c.keys_to_values((3,)).unchunk().toarray()) + assert allclose(x, c.keys_to_values((0, 1)).unchunk().toarray().transpose(2, 3, 0, 1)) + + b = array(x, sc, (0,)) + c = b.chunk((2, 3, 4)) + + assert allclose(x, c.keys_to_values((0,)).unchunk().toarray()) + +def test_values_to_keys(sc): + + x = arange(4*7*9*6).reshape(4, 7, 9, 6) + b = array(x, sc, (0, 1)) + c = b.chunk((4, 2)) + + assert allclose(x, c.values_to_keys((0,)).unchunk().toarray()) + assert allclose(x, c.values_to_keys((1,)).unchunk().toarray().transpose(0, 1, 3, 2)) + assert allclose(x, c.values_to_keys((0, 1)).unchunk().toarray()) + assert allclose(x, c.values_to_keys(()).unchunk().toarray()) + + b = array(x, sc, (0,)) + c = b.chunk((2, 3, 4)) + + assert allclose(x, c.values_to_keys((0,)).unchunk().toarray()) + assert allclose(x, c.values_to_keys((0, 1)).unchunk().toarray()) + + +def test_padding(sc): + + x = arange(2*2*5*6).reshape(2, 2, 5, 6) + b = array(x, sc, (0, 1)) + + c = b.chunk((2, 2), padding=1) + chunks = c.tordd().sortByKey().values().collect() + assert allclose(chunks[0], asarray([[0, 1, 2], [6, 7, 8], [12, 13, 14]])) + assert allclose(chunks[1], asarray([[1, 2, 3, 4], [7, 8, 9, 10], [13, 14, 15, 16]])) + assert allclose(chunks[4], asarray([[7, 8, 9, 10], [13, 14, 15, 16], [19, 20, 21, 22], [25, 26, 27, 28]])) + assert allclose(chunks[6], asarray([[18, 19, 20], [24, 25, 26]])) + + c = b.chunk((3, 3), padding=(1, 2)) + chunks = c.tordd().sortByKey().values().collect() + assert allclose(chunks[0], asarray([[0, 1, 2, 3, 4], [6, 7, 8, 9, 10], [12, 13, 14, 15, 16], [18, 19, 20, 21, 22]])) + + c = b.chunk((2,2), padding=1) + assert allclose(x, c.unchunk().toarray()) + assert allclose(x, c.keys_to_values((1,)).unchunk().toarray()) + assert allclose(x, c.values_to_keys((0,)).unchunk().toarray()) + +def test_padding_errors(sc): + + x = arange(2*2*5*6).reshape(2, 2, 5, 6) + b = array(x, sc, (0, 1)) + + with pytest.raises(ValueError): + c = b.chunk((2, 2), padding=(3, 1)) + + with pytest.raises(ValueError): + c = b.chunk((4, 4), padding=(2, 2)) + + with pytest.raises(NotImplementedError): + c = b.chunk((2, 2), padding=1) + d = c.map(lambda x: x[:, 0]) + +def test_map(sc): + + x = arange(4*8*8).reshape(4, 8, 8) + b = array(x, sc) + + c = b.chunk(size=(4, 8)) + + # no change of shape + def f(x): + return 2*x + + assert allclose(c.map(f).unchunk().toarray(), f(x)) + assert allclose(c.map(f, value_shape=(4, 
8)).unchunk().toarray(), f(x)) + + # changing the size of an unchunked axis + def f(x): + return x[:, :4] + def f_local(x): + return x[:, :, :4] + + assert allclose(c.map(f).unchunk().toarray(), f_local(x)) + assert allclose(c.map(f, value_shape=(4, 4)).unchunk().toarray(), f_local(x)) + +def test_map_errors(sc): + + x = arange(4*8*8).reshape(4, 8, 8) + b = array(x, sc) + + c = b.chunk(size=(4, 8)) + + # changing the size of a chunked axis + def f(x): + return x[:2, :] + + with pytest.raises(ValueError): + c.map(f) + + with pytest.raises(ValueError): + c.map(f, value_shape=(2, 8)) + + # dropping dimensions + def f(x): + return x[0, :] + + with pytest.raises(NotImplementedError): + c.map(f) + + with pytest.raises(NotImplementedError): + c.map(f, value_shape=(4,)) + +def test_map_generic(sc): + + x = arange(2*8*8).reshape(2, 8, 8) + b = array(x, sc) + + c = b.chunk(size=(8, 5)) + d = c.map_generic(lambda x: [0, 1]).toarray() + + truth = empty(2*1*2, dtype=object) + for i in range(truth.shape[0]): + truth[i] = [0, 1] + truth = truth.reshape(2, 1, 2) + + assert array_equal(d, truth) + +def test_properties(sc): + + x = arange(4*6).reshape(1, 4, 6) + b = array(x, sc) + + assert b.chunk(size=(2, 3)).uniform is True + assert b.chunk(size=(2, 4)).uniform is False + +def test_args(sc): + + x = arange(4*6).reshape(1, 4, 6) + b = array(x, sc) + + with pytest.raises(ValueError): + b.chunk(size=(5, 6)) diff --git a/test/test_spark_construct.py b/test/test_spark_construct.py new file mode 100644 index 0000000..a8e43e4 --- /dev/null +++ b/test/test_spark_construct.py @@ -0,0 +1,96 @@ +import pytest +from numpy import arange +from bolt import array, ones, zeros, concatenate +from bolt.utils import allclose +from bolt.array.array import BoltArraySpark + +def test_array(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc) + assert isinstance(b, BoltArraySpark) + assert allclose(x, b.toarray()) + + b = array(x, sc, axis=0) + assert isinstance(b, BoltArraySpark) + assert allclose(x, b.toarray()) + + b = array(x, sc, axis=(0, 1)) + assert isinstance(b, BoltArraySpark) + assert allclose(x, b.toarray()) + + b = array(x, sc, axis=(0, 1), npartitions=5) + assert isinstance(b, BoltArraySpark) + assert allclose(x, b.toarray()) + assert b.tordd().getNumPartitions() == 5 + +def test_array_errors(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + with pytest.raises(ValueError): + array(x, sc, axis=-1) + + with pytest.raises(ValueError): + array(x, sc, axis=(0, 1, 2, 3)) + +def test_ones(sc): + + from numpy import ones as npones + x = npones((2, 3, 4)) + b = ones((2, 3, 4), sc) + assert allclose(x, b.toarray()) + + x = npones(5) + b = ones(5, sc) + assert allclose(x, b.toarray()) + +def test_zeros(sc): + + from numpy import zeros as npzeros + x = npzeros((2, 3, 4)) + b = zeros((2, 3, 4), sc) + assert allclose(x, b.toarray()) + + x = npzeros(5) + b = zeros(5, sc) + assert allclose(x, b.toarray()) + +def test_concatenate(sc): + + from numpy import concatenate as npconcatenate + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=0) + bb = concatenate((b, b), axis=0) + assert allclose(npconcatenate((x, x), axis=0), bb.toarray()) + + bb = concatenate((b, b), axis=1) + assert allclose(npconcatenate((x, x), axis=1), bb.toarray()) + + bb = concatenate((b, b), axis=2) + assert allclose(npconcatenate((x, x), axis=2), bb.toarray()) + + b = array(x, sc, axis=(0, 1)) + bb = concatenate((b, b), axis=0) + assert allclose(npconcatenate((x, x), axis=0), bb.toarray()) + + b = array(x, sc, axis=(0, 1)) + bb = 
concatenate((b, b), axis=1) + assert allclose(npconcatenate((x, x), axis=1), bb.toarray()) + + b = array(x, sc, axis=(0, 1)) + bb = concatenate((b, b), axis=2) + assert allclose(npconcatenate((x, x), axis=2), bb.toarray()) + +def test_concatenate_errors(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc, axis=0) + + with pytest.raises(ValueError): + concatenate(b) + + with pytest.raises(NotImplementedError): + concatenate((b, b, b)) diff --git a/test/test_spark_functional.py b/test/test_spark_functional.py new file mode 100644 index 0000000..ad778dd --- /dev/null +++ b/test/test_spark_functional.py @@ -0,0 +1,118 @@ +import pytest +from numpy import arange, repeat +from bolt import array +from bolt.utils import allclose +import generic + +def test_map(sc): + import random + random.seed(42) + + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=0) + + # Test all map functionality when the base array is split after the first axis + generic.map_suite(x, b) + + # Split the BoltArraySpark after the second axis and rerun the tests + b = array(x, sc, axis=(0, 1)) + generic.map_suite(x, b) + + # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests + b = array(x, sc, axis=(0, 1, 2)) + generic.map_suite(x, b) + +def test_map_with_keys(sc): + x = arange(2*3).reshape(2, 3) + b = array(x, sc, axis=0) + c = b.map(lambda kv: kv[0] + kv[1], with_keys=True) + assert allclose(b.toarray() + [[0, 0, 0], [1, 1, 1]], c.toarray()) + +def test_reduce(sc): + from numpy import asarray + + dims = (10, 10, 10) + area = dims[0] * dims[1] + arr = asarray([repeat(x, area).reshape(dims[0], dims[1]) for x in range(dims[2])]) + b = array(arr, sc, axis=0) + + # Test all reduce functionality when the base array is split after the first axis + generic.reduce_suite(arr, b) + + # Split the BoltArraySpark after the second axis and rerun the tests + b = array(arr, sc, axis=(0, 1)) + generic.reduce_suite(arr, b) + + # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests + b = array(arr, sc, axis=(0, 1, 2)) + generic.reduce_suite(arr, b) + +def test_filter(sc): + + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=0) + + # Test all filter functionality when the base array is split after the first axis + generic.filter_suite(x, b) + + # Split the BoltArraySpark after the second axis and rerun the tests + b = array(x, sc, axis=(0, 1)) + generic.filter_suite(x, b) + + # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests + b = array(x, sc, axis=(0, 1, 2)) + generic.filter_suite(x, b) + +def test_mean(sc): + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=(0,)) + + assert allclose(b.mean(), x.mean()) + assert allclose(b.mean(axis=0), x.mean(axis=0)) + assert allclose(b.mean(axis=(0, 1)), x.mean(axis=(0, 1))) + assert b.mean(axis=(0, 1, 2)) == x.mean(axis=(0, 1, 2)) + +def test_std(sc): + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=(0,)) + + assert allclose(b.std(), x.std()) + assert allclose(b.std(axis=0), x.std(axis=0)) + assert allclose(b.std(axis=(0, 1)), x.std(axis=(0, 1))) + assert b.std(axis=(0, 1, 2)) == x.std(axis=(0, 1, 2)) + +def test_var(sc): + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=(0,)) + + assert allclose(b.var(), x.var()) + assert allclose(b.var(axis=0), x.var(axis=0)) + assert allclose(b.var(axis=(0, 1)), x.var(axis=(0, 1))) + assert b.var(axis=(0, 1, 2)) == x.var(axis=(0, 1, 2)) + +def test_sum(sc): + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, 
axis=(0,)) + + assert allclose(b.sum(), x.sum()) + assert allclose(b.sum(axis=0), x.sum(axis=0)) + assert allclose(b.sum(axis=(0, 1)), x.sum(axis=(0, 1))) + assert b.sum(axis=(0, 1, 2)) == x.sum(axis=(0, 1, 2)) + +def test_min(sc): + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=(0,)) + + assert allclose(b.min(), x.min()) + assert allclose(b.min(axis=0), x.min(axis=0)) + assert allclose(b.min(axis=(0, 1)), x.min(axis=(0, 1))) + assert b.min(axis=(0, 1, 2)) == x.min(axis=(0, 1, 2)) + +def test_max(sc): + x = arange(2*3*4).reshape(2, 3, 4) + b = array(x, sc, axis=(0,)) + + assert allclose(b.max(), x.max()) + assert allclose(b.max(axis=0), x.max(axis=0)) + assert allclose(b.max(axis=(0, 1)), x.max(axis=(0, 1))) + assert b.max(axis=(0, 1, 2)) == x.max(axis=(0, 1, 2)) diff --git a/test/test_spark_getting.py b/test/test_spark_getting.py new file mode 100644 index 0000000..64f0358 --- /dev/null +++ b/test/test_spark_getting.py @@ -0,0 +1,170 @@ +import pytest +from numpy import arange +from bolt import array, ones +from bolt.utils import allclose + + +def test_getitem_slice(sc): + x = arange(6*6).reshape((6, 6)) + + b = array(x, sc, axis=0) + assert allclose(b[0:1, 0:1].toarray(), x[0:1, 0:1]) + assert allclose(b[0:2, 0:2].toarray(), x[0:2, 0:2]) + assert allclose(b[0:2, 0:3].toarray(), x[0:2, 0:3]) + assert allclose(b[0:2, 0:3:2].toarray(), x[0:2, 0:3:2]) + assert allclose(b[:2, :2].toarray(), x[:2, :2]) + assert allclose(b[1:, 1:].toarray(), x[1:, 1:]) + assert allclose(b[5:1:-1, 5:1:-1].toarray(), x[5:1:-1, 5:1:-1]) + assert allclose(b[10:-10:-2, 10:-10:-2].toarray(), x[10:-10:-2, 10:-10:-2]) + assert allclose(b[-5:-1, -5:-1].toarray(), x[-5:-1, -5:-1]) + assert allclose(b[-1:-5:-2, -1:-5:-2].toarray(), x[-1:-5:-2, -1:-5:-2]) + + b = array(x, sc, axis=(0, 1)) + assert allclose(b[0:1, 0:1].toarray(), x[0:1, 0:1]) + assert allclose(b[0:2, 0:2].toarray(), x[0:2, 0:2]) + assert allclose(b[0:2, 0:3].toarray(), x[0:2, 0:3]) + assert allclose(b[0:2, 0:3:2].toarray(), x[0:2, 0:3:2]) + assert allclose(b[:2, :2].toarray(), x[:2, :2]) + assert allclose(b[1:, 1:].toarray(), x[1:, 1:]) + assert allclose(b[5:1:-1, 5:1:-1].toarray(), x[5:1:-1, 5:1:-1]) + assert allclose(b[10:-10:-2, 10:-10:-2].toarray(), x[10:-10:-2, 10:-10:-2]) + assert allclose(b[-5:-1, -5:-1].toarray(), x[-5:-1, -5:-1]) + assert allclose(b[-1:-5:-2, -1:-5:-2].toarray(), x[-1:-5:-2, -1:-5:-2]) + +def test_getitem_slice_ragged(sc): + + x = arange(10*10*3).reshape((10, 10, 3)) + + b = array(x, sc, axis=(0,1)) + assert allclose(b[0:5:2, 0:2].toarray(), x[0:5:2, 0:2]) + assert allclose(b[0:5:3, 0:2].toarray(), x[0:5:3, 0:2]) + assert allclose(b[0:9:3, 0:2].toarray(), x[0:9:3, 0:2]) + +def test_getitem_int(sc): + + x = arange(2*3).reshape((2, 3)) + + b = array(x, sc, axis=0) + assert allclose(b[0, 0], x[0, 0]) + assert allclose(b[0, 1], x[0, 1]) + assert allclose(b[0, 0:1], x[0, 0:1]) + assert allclose(b[1, 2], x[1, 2]) + assert allclose(b[0], x[0]) + assert allclose(b[[0]], x[[0]]) + assert allclose(b[(0)], x[(0)]) + assert allclose(b[[1], [2]], x[[1], [2]]) + assert allclose(b[[1], 2], x[[1], 2]) + assert allclose(b[-1, -2], x[-1, -2]) + + b = array(x, sc, axis=(0, 1)) + assert allclose(b[0, 0], x[0, 0]) + assert allclose(b[0, 1], x[0, 1]) + assert allclose(b[0, 0:1], x[0, 0:1]) + assert allclose(b[1, 2], x[1, 2]) + assert allclose(b[0], x[0]) + assert allclose(b[[0]], x[[0]]) + assert allclose(b[(0)], x[(0)]) + assert allclose(b[[1], [2]], x[[1], [2]]) + assert allclose(b[[1], 2], x[[1], 2]) + assert allclose(b[-1, -2], x[-1, 
-2]) + +def test_getitem_list(sc): + + x = arange(3*3*4).reshape((3, 3, 4)) + + b = array(x, sc, axis=0) + assert allclose(b[[0, 1], [0, 1], [0, 2]].toarray(), x[[0, 1], [0, 1], [0, 2]]) + assert allclose(b[[0, 1], [0, 2], [0, 3]].toarray(), x[[0, 1], [0, 2], [0, 3]]) + assert allclose(b[[0, 1, 2], [0, 2, 1], [0, 3, 1]].toarray(), x[[0, 1, 2], [0, 2, 1], [0, 3, 1]]) + + b = array(x, sc, axis=(0,1)) + assert allclose(b[[0, 1], [0, 1], [0, 2]].toarray(), x[[0, 1], [0, 1], [0, 2]]) + assert allclose(b[[0, 1], [0, 2], [0, 3]].toarray(), x[[0, 1], [0, 2], [0, 3]]) + assert allclose(b[[0, 1, 2], [0, 2, 1], [0, 3, 1]].toarray(), x[[0, 1, 2], [0, 2, 1], [0, 3, 1]]) + +def test_getitem_list_array(sc): + + x = arange(3*3*4).reshape((3, 3, 4)) + + rows = [[0, 0], [1, 1]] + cols = [[0, 2], [0, 2]] + dept = [[0, 3], [0, 3]] + + b = array(x, sc, axis=0) + assert allclose(b[rows, cols, dept].toarray(), x[rows, cols, dept]) + + b = array(x, sc, axis=(0, 1)) + assert allclose(b[rows, cols, dept].toarray(), x[rows, cols, dept]) + +def test_getitem_mixed(sc): + + x = arange(4*4*4*4).reshape(4, 4, 4, 4) + b = array(x, sc, axis=(0, 1)) + + i = [0, 1] + s = slice(1, 3) + assert allclose(b[i, :, :, :].toarray(), x[i, :, :, :]) + assert allclose(b[i, s, s, s].toarray(), x[i, s, s, s]) + assert allclose(b[:, :, i, :].toarray(), x[:, :, i, :]) + assert allclose(b[s, s, i, s].toarray(), x[s, s, i, s]) + + i = [1] + assert allclose(b[i, :, :, :].toarray(), x[i, :, :, :]) + assert allclose(b[:, :, i, :].toarray(), x[:, :, i, :]) + + i = [[0, 1], [1, 0]] + with pytest.raises(ValueError): + b[i, :, :, :] + +def test_bounds(sc): + + x = arange(5) + b = array(x, sc) + + # out of bounds + with pytest.raises(ValueError): + b[5] + + with pytest.raises(ValueError): + b[-6] + + with pytest.raises(ValueError): + b[[1,5]] + + # slicing that would produce an empty dimension + with pytest.raises(ValueError): + b[3:2] + + with pytest.raises(ValueError): + b[5:] + + with pytest.raises(ValueError): + b[-6:0] + +def test_squeeze(sc): + + from numpy import ones as npones + + x = npones((1, 2, 1, 4)) + b = ones((1, 2, 1, 4), sc, axis=0) + assert allclose(b.squeeze().toarray(), x.squeeze()) + assert allclose(b.squeeze((0, 2)).toarray(), x.squeeze((0, 2))) + assert allclose(b.squeeze(0).toarray(), x.squeeze(0)) + assert allclose(b.squeeze(2).toarray(), x.squeeze(2)) + assert b.squeeze().split == 0 + assert b.squeeze((0, 2)).split == 0 + assert b.squeeze(2).split == 1 + + x = npones((1, 2, 1, 4)) + b = ones((1, 2, 1, 4), sc, axis=(0, 1)) + assert allclose(b.squeeze().toarray(), x.squeeze()) + assert allclose(b.squeeze((0, 2)).toarray(), x.squeeze((0, 2))) + assert allclose(b.squeeze(0).toarray(), x.squeeze(0)) + assert allclose(b.squeeze(2).toarray(), x.squeeze(2)) + assert b.squeeze().split == 1 + assert b.squeeze((0, 2)).split == 1 + assert b.squeeze(2).split == 2 + + x = npones((1, 1, 1, 1)) + b = ones((1, 1, 1, 1), sc, axis=(0, 1)) + assert allclose(b.squeeze().toarray(), x.squeeze()) diff --git a/test/test_spark_shaping.py b/test/test_spark_shaping.py new file mode 100644 index 0000000..6c1690d --- /dev/null +++ b/test/test_spark_shaping.py @@ -0,0 +1,247 @@ +import pytest +from numpy import arange, prod +from itertools import permutations +from bolt import array +from bolt.utils import allclose + +def test_value_shape(sc): + + x = arange(2*3).reshape((2, 3)) + b = array(x, sc) + assert b.values.shape == (3,) + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc, axis=0) + assert b.values.shape == (3, 4) + +def 
test_key_shape(sc): + + x = arange(2*3).reshape((2, 3)) + b = array(x, sc) + assert b.keys.shape == (2,) + + x = arange(2*3*4).reshape((2, 3, 4)) + b = array(x, sc, axis=(0, 1)) + assert b.keys.shape == (2, 3) + +def test_reshape_keys(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=(0, 1)) + c = b.keys.reshape((3, 2)) + assert c.keys.shape == (3, 2) + assert allclose(c.toarray(), x.reshape((3, 2, 4))) + + b = array(x, sc, axis=0) + c = b.keys.reshape((2, 1)) + assert allclose(c.toarray(), x.reshape((2, 1, 3, 4))) + + b = array(x, sc, axis=(0,)) + c = b.keys.reshape((2,)) + assert allclose(c.toarray(), x) + + b = array(x, sc, axis=(0, 1)) + c = b.keys.reshape((2, 3)) + assert allclose(c.toarray(), x) + +def test_reshape_keys_errors(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=(0, 1)) + with pytest.raises(ValueError): + b.keys.reshape((2, 3, 4)) + +def test_reshape_values(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=(0,)) + c = b.values.reshape((4, 3)) + assert c.values.shape == (4, 3) + assert allclose(c.toarray(), x.reshape((2, 4, 3))) + + b = array(x, sc, axis=(0, 1)) + c = b.values.reshape((1, 4)) + assert allclose(c.toarray(), x.reshape((2, 3, 1, 4))) + + b = array(x, sc, axis=(0, 1)) + c = b.values.reshape((4,)) + assert allclose(c.toarray(), x) + + b = array(x, sc, axis=0) + c = b.values.reshape((3, 4)) + assert allclose(c.toarray(), x) + +def test_reshape_values_errors(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=(0, 1)) + with pytest.raises(ValueError): + b.values.reshape((2, 3, 4)) + +def test_transpose_keys(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=(0, 1)) + c = b.keys.transpose((1, 0)) + assert c.keys.shape == (3, 2) + assert allclose(c.toarray(), x.transpose((1, 0, 2))) + + b = array(x, sc, axis=0) + c = b.keys.transpose((0,)) + assert allclose(c.toarray(), x) + + b = array(x, sc, axis=(0, 1)) + c = b.keys.transpose((0, 1)) + assert allclose(c.toarray(), x) + +def test_transpose_keys_errors(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=(0, 1)) + with pytest.raises(ValueError): + b.keys.transpose((0, 2)) + + with pytest.raises(ValueError): + b.keys.transpose((1, 1)) + + with pytest.raises(ValueError): + b.keys.transpose((0,)) + +def test_transpose_values(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=0) + c = b.values.transpose((1, 0)) + assert c.values.shape == (4, 3) + assert allclose(c.toarray(), x.transpose((0, 2, 1))) + + b = array(x, sc, axis=0) + c = b.values.transpose((0, 1)) + assert allclose(c.toarray(), x) + + b = array(x, sc, axis=(0, 1)) + c = b.values.transpose((0,)) + assert allclose(c.toarray(), x.reshape((2, 3, 4))) + +def test_traspose_values_errors(sc): + + x = arange(2*3*4).reshape((2, 3, 4)) + + b = array(x, sc, axis=0) + with pytest.raises(ValueError): + b.values.transpose((0, 2)) + + with pytest.raises(ValueError): + b.values.transpose((1, 1)) + + with pytest.raises(ValueError): + b.values.transpose((0,)) + + +def test_swap(sc): + + a = arange(2**8).reshape(*(8*[2])) + b = array(a, sc, axis=(0, 1, 2, 3)) + + bs = b.swap((1, 2), (0, 3), size=(2, 2)) + at = a.transpose((0, 3, 4, 7, 1, 2, 5, 6)) + assert allclose(at, bs.toarray()) + + bs = b.swap((1, 2), (0, 3), size="50") + at = a.transpose((0, 3, 4, 7, 1, 2, 5, 6)) + assert allclose(at, bs.toarray()) + + bs = b.swap((1, 2), (0, 3)) + at = a.transpose((0, 3, 4, 7, 1, 2, 5, 6)) + assert allclose(at, bs.toarray()) + + bs = 
b.swap((), (0, 1, 2, 3)) + at = a + assert allclose(at, bs.toarray()) + + bs = b.swap(0, 0) + at = a.transpose((1, 2, 3, 4, 0, 5, 6, 7)) + assert allclose(at, bs.toarray()) + + bs = b.swap([], 0) + at = a.transpose((0, 1, 2, 3, 4, 5, 6, 7)) + assert allclose(at, bs.toarray()) + assert bs.split == 5 + + bs = b.swap(0, []) + at = a.transpose((1, 2, 3, 0, 4, 5, 6, 7)) + assert allclose(at, bs.toarray()) + assert bs.split == 3 + + b = array(a, sc, axis=range(8)) + bs = b.swap([0,1], []) + at = a.transpose((2, 3, 4, 5, 6, 7, 0, 1)) + assert allclose(at, bs.toarray()) + assert bs.split == 6 + + a = arange(2*3*4).reshape(2, 3, 4) + b = array(a, sc, axis=(0,)) + + bs = b.swap((0,), (0, 1)) + at = a.transpose(1, 2, 0) + assert allclose(at, bs.toarray()) + + +def test_transpose(sc): + + n = 4 + perms = list(permutations(range(n), n)) + + a = arange(2*3*4*5).reshape((2, 3, 4, 5)) + + b = array(a, sc, axis=(0, 1)) + for p in perms: + assert allclose(b.transpose(p).toarray(), b.toarray().transpose(p)) + + assert allclose(b.transpose(), b.toarray().transpose()) + +def test_t(sc): + + a = arange(2*3*4*5).reshape((2, 3, 4, 5)) + + b = array(a, sc, axis=0) + assert allclose(b.T.toarray(), b.toarray().T) + + b = array(a, sc, axis=(0, 1)) + assert allclose(b.T.toarray(), b.toarray().T) + +def test_swapaxes(sc): + + a = arange(2*3*4*5).reshape((2, 3, 4, 5)) + + b = array(a, sc, axis=(0, 1)) + assert allclose(b.swapaxes(1, 2).toarray(), b.toarray().swapaxes(1, 2)) + assert allclose(b.swapaxes(0, 1).toarray(), b.toarray().swapaxes(0, 1)) + assert allclose(b.swapaxes(2, 3).toarray(), b.toarray().swapaxes(2, 3)) + +def test_reshape(sc): + + old_shape = (6, 10, 4, 12) + a = arange(prod(old_shape)).reshape(old_shape) + b = array(a, sc, axis=(0, 1)) + + # keys only + new_shape = (15, 4, 4, 12) + assert allclose(b.reshape(new_shape).toarray(), b.toarray().reshape(new_shape)) + # values only + new_shape = (6, 10, 24, 2) + assert allclose(b.reshape(new_shape).toarray(), b.toarray().reshape(new_shape)) + # keys and values, independent + new_shape = (15, 4, 24, 2) + assert allclose(b.reshape(new_shape).toarray(), b.toarray().reshape(new_shape)) + # keys and values, mixing + new_shape = (6, 4, 10, 12) + with pytest.raises(NotImplementedError): + b.reshape(new_shape) diff --git a/test/test_spark_stacking.py b/test/test_spark_stacking.py new file mode 100644 index 0000000..2e146a9 --- /dev/null +++ b/test/test_spark_stacking.py @@ -0,0 +1,133 @@ +import pytest +from numpy import arange, repeat, asarray, vstack, tile +from bolt import array, ones +from bolt.utils import allclose +from bolt.array.array import BoltArraySpark + + +def _2D_stackable_preamble(sc, num_partitions=2): + + dims = (10, 10) + arr = vstack([[x]*dims[1] for x in arange(dims[0])]) + barr = array(arr, sc, axis=0) + barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions), + shape=barr.shape, split=barr.split) + return barr + +def _3D_stackable_preamble(sc, num_partitions=2): + + dims = (10, 10, 10) + area = dims[0] * dims[1] + arr = asarray([repeat(x, area).reshape(dims[0], dims[1]) for x in range(dims[2])]) + barr = array(arr, sc, axis=0) + barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions), + shape=barr.shape, split=barr.split) + return barr + +def test_stack_2D(sc): + + barr = _2D_stackable_preamble(sc) + + # without stack_size + stacked = barr.stack() + first_partition = stacked._rdd.first()[1] + assert first_partition.shape == (5, 10) + assert stacked.shape == (10, 10) + + # with stack_size + stacked = barr.stack(size=2) + first_partition 
= stacked._rdd.first()[1] + assert first_partition.shape == (2, 10) + + # invalid stack_size + stacked = barr.stack(size=0) + first_partition = stacked._rdd.first()[1] + assert first_partition.shape == (5, 10) + + # unstacking + unstacked = stacked.unstack() + arr = unstacked.toarray() + assert arr.shape == (10, 10) + assert allclose(arr, barr.toarray()) + +def test_stack_3D(sc): + + barr = _3D_stackable_preamble(sc) + + # with stack_size + stacked = barr.stack(size=2) + first_partition = stacked._rdd.first()[1] + assert first_partition.shape == (2, 10, 10) + + # invalid stack_size + stacked = barr.stack(size=0) + first_partition = stacked._rdd.first()[1] + assert first_partition.shape == (5, 10, 10) + + # unstacking + unstacked = stacked.unstack() + arr = unstacked.toarray() + assert arr.shape == (10, 10, 10) + assert allclose(arr, barr.toarray()) + +def test_stacked_map(sc): + + barr = _2D_stackable_preamble(sc) + + map_func1 = lambda x: x * 2 + + funcs = [map_func1] + + for func in funcs: + stacked = barr.stack() + stacked_map = stacked.map(func) + normal_map = barr.map(func) + unstacked = stacked_map.unstack() + assert normal_map.shape == unstacked.shape + assert normal_map.split == unstacked.split + assert allclose(normal_map.toarray(), unstacked.toarray()) + +def test_stacked_shape_inference(sc): + + from numpy import ones as npones + + a = ones((100, 2), sc) + a._rdd = a._rdd.partitionBy(2) + s = a.stack(5) + n = s.tordd().count() + + # operations that preserve keys + assert s.map(lambda x: x * 2).unstack().shape == (100, 2) + assert s.map(lambda x: x.sum(axis=1)).unstack().shape == (100,) + assert s.map(lambda x: tile(x, (1, 2))).unstack().shape == (100, 4) + + # operations that create new keys + assert s.map(lambda x: npones((2, 2))).unstack().shape == (n, 2, 2) + assert s.map(lambda x: x.sum(axis=0)).unstack().shape == (n, 2) + assert s.map(lambda x: asarray([2])).unstack().toarray().shape == (n, 1) + assert s.map(lambda x: asarray(2)).unstack().toarray().shape == (n,) + + # composing functions works + assert s.map(lambda x: x * 2).map(lambda x: x * 2).unstack().shape == (100, 2) + assert s.map(lambda x: x * 2).map(lambda x: npones((2, 2))).unstack().shape == (n, 2, 2) + assert s.map(lambda x: npones((2, 2))).map(lambda x: x * 2).unstack().shape == (n, 2, 2) + + # check the result + assert allclose(s.map(lambda x: x.sum(axis=1)).unstack().toarray(), npones(100) * 2) + assert allclose(s.map(lambda x: tile(x, (1, 2))).unstack().toarray(), npones((100, 4))) + + with pytest.raises(ValueError): + s.map(lambda x: 2) + + with pytest.raises(ValueError): + s.map(lambda x: None) + + with pytest.raises(RuntimeError): + s.map(lambda x: 1/0) + +def test_stacked_conversion(sc): + + from pyspark import RDD + barr = _2D_stackable_preamble(sc) + k1 = barr.tordd().keys() + assert isinstance(k1, RDD) From 92dfb77f277af884251a646e6b965feaa3819d5a Mon Sep 17 00:00:00 2001 From: jwittenbach Date: Fri, 6 Jan 2017 14:34:06 -0500 Subject: [PATCH 3/4] removes BoltArray base class as well as BoltArray.mode attribute --- bolt/array/array.py | 40 ++++++--- bolt/array/base.py | 158 ---------------------------------- bolt/array/chunk.py | 10 +-- bolt/array/construct.py | 26 +++--- bolt/array/shapes.py | 32 +++---- bolt/array/stack.py | 8 +- test/generic.py | 20 ++--- test/test_spark_construct.py | 10 +-- test/test_spark_functional.py | 12 +-- test/test_spark_stacking.py | 6 +- 10 files changed, 87 insertions(+), 235 deletions(-) delete mode 100644 bolt/array/base.py diff --git a/bolt/array/array.py 
b/bolt/array/array.py index 8915b60..adeef53 100644 --- a/bolt/array/array.py +++ b/bolt/array/array.py @@ -3,14 +3,13 @@ r_, sort, argsort, array, random, arange, ones, expand_dims, sum from itertools import groupby -from bolt.array.base import BoltArray from bolt.array.stack import StackedArray from bolt.array.utils import zip_with_index from bolt.array.statcounter import StatCounter from bolt.utils import slicify, listify, tupleize, argpack, inshape, istransposeable, isreshapeable -class BoltArraySpark(BoltArray): +class BoltArray(object): _metadata = { '_shape': None, @@ -24,12 +23,25 @@ def __init__(self, rdd, shape=None, split=None, dtype=None, ordered=True): self._shape = shape self._split = split self._dtype = dtype - self._mode = 'spark' self._ordered = ordered + def __finalize__(self, other): + if isinstance(other, BoltArray): + for name in self._metadata: + other_attr = getattr(other, name, None) + if (other_attr is not self._metadata[name]) \ + and (getattr(self, name, None) is self._metadata[name]): + object.__setattr__(self, name, other_attr) + return self + + def __repr__(self): + s = "BoltArray\n" + s += "shape: %s\n" % str(self.shape) + return s + @property def _constructor(self): - return BoltArraySpark + return BoltArray def __array__(self): return self.toarray() @@ -98,7 +110,7 @@ def _align(self, axis): Returns ------- - BoltArraySpark + BoltArray """ # ensure that the specified axes are valid inshape(self.shape, axis) @@ -149,7 +161,7 @@ def map(self, func, axis=(0,), value_shape=None, dtype=None, with_keys=False): Returns ------- - BoltArraySpark + BoltArray """ axis = tupleize(axis) swapped = self._align(axis) @@ -212,7 +224,7 @@ def filter(self, func, axis=(0,), sort=False): Returns ------- - BoltArraySpark + BoltArray """ axis = tupleize(axis) @@ -259,7 +271,7 @@ def reduce(self, func, axis=(0,), keepdims=False): Returns ------- - BoltArraySpark + BoltArray """ from numpy import ndarray @@ -294,7 +306,7 @@ def _stat(self, axis=None, func=None, name=None, keepdims=False): will compute over all axes func : function, optional, default=None - Function for reduce, see BoltArraySpark.reduce + Function for reduce, see BoltArray.reduce name : str A named statistic, see StatCounter @@ -429,7 +441,7 @@ def concatenate(self, arry, axis=0): Paramters --------- - arry : ndarray, BoltArrayLocal, or BoltArraySpark + arry : ndarray, or BoltArray Another array to concatenate with axis : int, optional, default=0 @@ -437,13 +449,13 @@ def concatenate(self, arry, axis=0): Returns ------- - BoltArraySpark + BoltArray """ if isinstance(arry, ndarray): from bolt.array.construct import array arry = array(arry, self._rdd.context, axis=range(0, self.split)) else: - if not isinstance(arry, BoltArraySpark): + if not isinstance(arry, BoltArray): raise ValueError("other must be local array or spark array, got %s" % type(arry)) if not all([x == y if not i == axis else True @@ -736,7 +748,7 @@ def swap(self, kaxes, vaxes, size="150"): Returns ------- - BoltArraySpark + BoltArray """ kaxes = asarray(tupleize(kaxes), 'int') vaxes = asarray(tupleize(vaxes), 'int') @@ -850,7 +862,7 @@ def reshape(self, *shape): i = self._reshapebasic(new) if i == -1: raise NotImplementedError("Currently no support for reshaping between " - "keys and values for BoltArraySpark") + "keys and values for BoltArray") else: new_key_shape, new_value_shape = new[:i], new[i:] return self.keys.reshape(new_key_shape).values.reshape(new_value_shape) diff --git a/bolt/array/base.py b/bolt/array/base.py deleted file mode 100644 
index 240d926..0000000 --- a/bolt/array/base.py +++ /dev/null @@ -1,158 +0,0 @@ -class BoltArray(object): - - _mode = None - _metadata = {} - - def __finalize__(self, other): - if isinstance(other, BoltArray): - for name in self._metadata: - other_attr = getattr(other, name, None) - if (other_attr is not self._metadata[name]) \ - and (getattr(self, name, None) is self._metadata[name]): - object.__setattr__(self, name, other_attr) - return self - - @property - def mode(self): - return self._mode - - @property - def shape(self): - """ - Size of each dimension. - """ - raise NotImplementedError - - @property - def size(self): - """ - Total number of elements. - """ - raise NotImplementedError - - @property - def ndim(self): - """ - Number of dimensions. - """ - raise NotImplementedError - - @property - def dtype(self): - """ - Data-type of array. - """ - raise NotImplementedError - - @property - def _constructor(self): - return None - - def sum(self, axis): - """ - Return the sum of the array elements over the given axis. - """ - raise NotImplementedError - - def mean(self, axis): - """ - Return the mean of the array elements over the given axis. - """ - raise NotImplementedError - - def var(self, axis): - """ - Return the variance of the array elements over the given axis. - """ - raise NotImplementedError - - def std(self, axis): - """ - Return the standard deviation of the array elements over the given axis. - """ - raise NotImplementedError - - def min(self, axis): - """ - Return the minimum of the array elements over the given axis or axes. - """ - raise NotImplementedError - - def max(self, axis): - """ - Return the maximum of the array elements over the given axis or axes. - """ - raise NotImplementedError - - def concatenate(self, arry, axis): - raise NotImplementedError - - def transpose(self, axis): - """ - Return an array with the axes transposed. - """ - raise NotImplementedError - - @property - def T(self): - """ - Transpose by reversing the order of the axes. - """ - raise NotImplementedError - - def reshape(self, axis): - """ - Return an array with the same data but a new shape. - """ - raise NotImplementedError - - def squeeze(self, axis): - """ - Remove one or more single-dimensional axes from the array. - """ - raise NotImplementedError - - def swapaxes(self, axis1, axis2): - """ - Return an array with two axes interchanged. - """ - raise NotImplementedError - - def astype(self, dtype, casting): - """ - Cast the array to a specified type. - """ - raise NotImplementedError - - def __getitem__(self, index): - raise NotImplementedError - - def map(self, func, axis): - """ - Apply a function across one or more axes. - """ - raise NotImplementedError - - def reduce(self, func, axis, keepdims): - """ - Reduce an array across one or more axes. - """ - raise NotImplementedError - - def filter(self, func, axis): - """ - Filter an array across one or more axes. 
- """ - raise NotImplementedError - - def first(self): - """ - Return the first element of the array - """ - raise NotImplementedError - - def __repr__(self): - s = "BoltArray\n" - s += "mode: %s\n" % self._mode - s += "shape: %s\n" % str(self.shape) - return s diff --git a/bolt/array/chunk.py b/bolt/array/chunk.py index 9f0f7a9..e8cfae9 100644 --- a/bolt/array/chunk.py +++ b/bolt/array/chunk.py @@ -5,12 +5,12 @@ from itertools import product from bolt.utils import tuplesort, tupleize, allstack, iterexpand -from bolt.array.array import BoltArraySpark +from bolt.array.array import BoltArray class ChunkedArray(object): """ - Wraps a BoltArraySpark and provides an interface for chunking + Wraps a BoltArray and provides an interface for chunking into subarrays and performing operations on chunks. Many methods will be restricted until the chunked array is unchunked. @@ -196,7 +196,7 @@ def _unchunk(it): else: newshape = self.shape - return BoltArraySpark(rdd, shape=newshape, split=self._split, + return BoltArray(rdd, shape=newshape, split=self._split, dtype=self.dtype, ordered=ordered) def keys_to_values(self, axes, size=None): @@ -416,7 +416,7 @@ def map_generic(self, func): """ Apply a generic array -> object to each subarray - The resulting object is a BoltArraySpark of dtype object where the + The resulting object is a BoltArray of dtype object where the blocked dimensions are replaced with indices indication block ID. """ def process_record(val): @@ -429,7 +429,7 @@ def process_record(val): nchunks = self.getnumber(self.plan, self.vshape) newshape = tuple([int(s) for s in r_[self.kshape, nchunks]]) newsplit = len(self.shape) - return BoltArraySpark(rdd, shape=newshape, split=newsplit, ordered=self._ordered, dtype="object") + return BoltArray(rdd, shape=newshape, split=newsplit, ordered=self._ordered, dtype="object") def getplan(self, size="150", axes=None, padding=None): """ diff --git a/bolt/array/construct.py b/bolt/array/construct.py index 429ecc9..74cb6bd 100644 --- a/bolt/array/construct.py +++ b/bolt/array/construct.py @@ -2,7 +2,7 @@ from itertools import product -from bolt.array.array import BoltArraySpark +from bolt.array.array import BoltArray from bolt.array.utils import get_kv_shape, get_kv_axes @@ -34,7 +34,7 @@ def array(a, context=None, axis=(0,), dtype=None, npartitions=None): Returns ------- - BoltArraySpark + BoltArray """ if dtype is None: arry = asarray(a) @@ -63,7 +63,7 @@ def array(a, context=None, axis=(0,), dtype=None, npartitions=None): vals = arry.reshape((prod(key_shape),) + val_shape) rdd = context.parallelize(zip(keys, vals), npartitions) - return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype) + return BoltArray(rdd, shape=shape, split=split, dtype=dtype) def ones(shape, context=None, axis=(0,), dtype=float64, npartitions=None): """ @@ -91,7 +91,7 @@ def ones(shape, context=None, axis=(0,), dtype=float64, npartitions=None): Returns ------- - BoltArraySpark + BoltArray """ from numpy import ones return _wrap(ones, shape, context, axis, dtype, npartitions) @@ -122,7 +122,7 @@ def zeros(shape, context=None, axis=(0,), dtype=float64, npartitions=None): Returns ------- - BoltArraySpark + BoltArray """ from numpy import zeros return _wrap(zeros, shape, context, axis, dtype, npartitions) @@ -143,7 +143,7 @@ def concatenate(arrays, axis=0): Returns ------- - BoltArraySpark + BoltArray """ if not isinstance(arrays, tuple): raise ValueError("data type not understood") @@ -151,9 +151,9 @@ def concatenate(arrays, axis=0): raise NotImplementedError("spark 
concatenation only supports two arrays") first, second = arrays - if isinstance(first, BoltArraySpark): + if isinstance(first, BoltArray): return first.concatenate(second, axis) - elif isinstance(second, BoltArraySpark): + elif isinstance(second, BoltArray): first = array(first, second._rdd.context) return first.concatenate(second, axis) else: @@ -166,8 +166,8 @@ def _argcheck(*args, **kwargs): Conditions are: (1) a positional argument is a SparkContext (2) keyword arg 'context' is a SparkContext - (3) an argument is a BoltArraySpark, or - (4) an argument is a nested list containing a BoltArraySpark + (3) an argument is a BoltArray, or + (4) an argument is a nested list containing a BoltArray """ try: from pyspark import SparkContext @@ -176,8 +176,8 @@ def _argcheck(*args, **kwargs): cond1 = any([isinstance(arg, SparkContext) for arg in args]) cond2 = isinstance(kwargs.get('context', None), SparkContext) - cond3 = any([isinstance(arg, BoltArraySpark) for arg in args]) - cond4 = any([any([isinstance(sub, BoltArraySpark) for sub in arg]) + cond3 = any([isinstance(arg, BoltArray) for arg in args]) + cond4 = any([any([isinstance(sub, BoltArray) for sub in arg]) if isinstance(arg, (tuple, list)) else False for arg in args]) return cond1 or cond2 or cond3 or cond4 @@ -209,4 +209,4 @@ def _wrap(func, shape, context=None, axis=(0,), dtype=None, npartitions=None): # use a map to make the arrays in parallel rdd = rdd.map(lambda x: (x, func(value_shape, dtype, order='C'))) - return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype) + return BoltArray(rdd, shape=shape, split=split, dtype=dtype) diff --git a/bolt/array/shapes.py b/bolt/array/shapes.py index 5bd92e4..78794d7 100644 --- a/bolt/array/shapes.py +++ b/bolt/array/shapes.py @@ -1,12 +1,12 @@ from numpy import unravel_index, ravel_multi_index from bolt.utils import argpack, istransposeable, isreshapeable -from bolt.array.array import BoltArraySpark +from bolt.array.array import BoltArray class Shapes(object): """ - Base Shape class. These classes wrap a BoltArraySpark in their + Base Shape class. These classes wrap a BoltArray in their entirity, but implement the following attributes and methods as if they were only working on the keys or the values, depending which subclass is used. @@ -28,7 +28,7 @@ def transpose(self): class Keys(Shapes): """ This class implements all the base shape attributes and methods - for the keys of a BoltArraySpark. + for the keys of a BoltArray. """ def __init__(self, barray): self._barray = barray @@ -39,8 +39,8 @@ def shape(self): def reshape(self, *shape): """ - Reshape just the keys of a BoltArraySpark, returning a - new BoltArraySpark. + Reshape just the keys of a BoltArray, returning a + new BoltArray. Parameters ---------- @@ -61,12 +61,12 @@ def f(k): newsplit = len(new) newshape = new + self._barray.values.shape - return BoltArraySpark(newrdd, shape=newshape, split=newsplit).__finalize__(self._barray) + return BoltArray(newrdd, shape=newshape, split=newsplit).__finalize__(self._barray) def transpose(self, *axes): """ - Transpose just the keys of a BoltArraySpark, returning a - new BoltArraySpark. + Transpose just the keys of a BoltArray, returning a + new BoltArray. 
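The constructors touched above keep their signatures; only the returned type's name changes. A hedged sketch of that surface (not part of the patch), again assuming a live SparkContext `sc`:

    from bolt import ones, zeros, concatenate

    a = ones((4, 3, 2), sc, axis=(0,))
    b = zeros((4, 3, 2), sc, axis=(0,))

    c = concatenate((a, b), axis=0)   # exactly two arrays are supported
    assert c.shape == (8, 3, 2)
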
Parameters ---------- @@ -86,7 +86,7 @@ def f(k): newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1])) newshape = tuple(self.shape[i] for i in new) + self._barray.values.shape - return BoltArraySpark(newrdd, shape=newshape, ordered=False).__finalize__(self._barray) + return BoltArray(newrdd, shape=newshape, ordered=False).__finalize__(self._barray) def __str__(self): s = "BoltArray Keys\n" @@ -99,7 +99,7 @@ def __repr__(self): class Values(Shapes): """ This class implements all the base shape attributes and methods - for the values of a BoltArraySpark. + for the values of a BoltArray. """ def __init__(self, barray): self._barray = barray @@ -110,8 +110,8 @@ def shape(self): def reshape(self, *shape): """ - Reshape just the values of a BoltArraySpark, returning a - new BoltArraySpark. + Reshape just the values of a BoltArray, returning a + new BoltArray. Parameters ---------- @@ -131,12 +131,12 @@ def f(v): newrdd = self._barray._rdd.mapValues(f) newshape = self._barray.keys.shape + new - return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray) + return BoltArray(newrdd, shape=newshape).__finalize__(self._barray) def transpose(self, *axes): """ - Transpose just the values of a BoltArraySpark, returning a - new BoltArraySpark. + Transpose just the values of a BoltArray, returning a + new BoltArray. Parameters ---------- @@ -156,7 +156,7 @@ def f(v): newrdd = self._barray._rdd.mapValues(f) newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new) - return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray) + return BoltArray(newrdd, shape=newshape).__finalize__(self._barray) def __str__(self): s = "BoltArray Values\n" diff --git a/bolt/array/stack.py b/bolt/array/stack.py index 6071389..9858390 100644 --- a/bolt/array/stack.py +++ b/bolt/array/stack.py @@ -3,7 +3,7 @@ class StackedArray(object): """ - Wraps a BoltArraySpark and provides an interface for performing + Wraps a BoltArray and provides an interface for performing stacked operations (operations on aggregated subarrays). Many methods will be restricted or forbidden until the Stacked object is unstacked. Currently, only map() is implemented. The rationale @@ -69,16 +69,16 @@ def tostacks(partition): def unstack(self): """ - Unstack array and return a new BoltArraySpark via flatMap(). + Unstack array and return a new BoltArray via flatMap(). 
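The Keys/Values wrappers above expose reshaping and transposing of each side independently. A brief sketch, assuming `sc` and an array shaped like the one built earlier:

    from numpy import arange
    from bolt import array

    b = array(arange(2 * 3 * 4).reshape((2, 3, 4)), sc, axis=(0, 1))

    b.keys.shape                      # (2, 3) -- axes held in the RDD keys
    b.values.shape                    # (4,)   -- axes held in the value arrays

    flipped = b.keys.transpose(1, 0)                  # reorder key axes only -> (3, 2, 4)
    packed = b.keys.reshape(6).values.reshape(2, 2)   # reshape keys and values separately -> (6, 2, 2)
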
""" - from bolt.array.array import BoltArraySpark + from bolt.array.array import BoltArray if self._rekeyed: rdd = self._rdd else: rdd = self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1]))) - return BoltArraySpark(rdd, shape=self.shape, split=self.split) + return BoltArray(rdd, shape=self.shape, split=self.split) def map(self, func): """ diff --git a/test/generic.py b/test/generic.py index a94010a..4ab46c1 100644 --- a/test/generic.py +++ b/test/generic.py @@ -70,12 +70,11 @@ def nonuniform_map(x): res = mapped.toarray() # check that changes in dtype are correctly handled - if b.mode == 'spark': - func3 = lambda x: x.astype('float32') - mapped = b.map(func3, axis=0) - assert mapped.dtype == dtype('float32') - mapped = b.map(func3, axis=0, dtype=dtype('float32')) - assert mapped.dtype == dtype('float32') + func3 = lambda x: x.astype('float32') + mapped = b.map(func3, axis=0) + assert mapped.dtype == dtype('float32') + mapped = b.map(func3, axis=0, dtype=dtype('float32')) + assert mapped.dtype == dtype('float32') def reduce_suite(arr, b): """ @@ -151,11 +150,10 @@ def filter_half(x): assert res.shape[0] <= b.shape[0] # rerun with sorting - if not b.mode == "local": - filtered = b.filter(lambda x: filter_half(x) < 0.5, sort=True) - res = filtered.toarray() - assert res.shape[1:] == b.shape[1:] - assert res.shape[0] <= b.shape[0] + filtered = b.filter(lambda x: filter_half(x) < 0.5, sort=True) + res = filtered.toarray() + assert res.shape[1:] == b.shape[1:] + assert res.shape[0] <= b.shape[0] # filter out half of the values over the second axis filtered = b.filter(lambda x: filter_half(x) < 0.5, axis=1) diff --git a/test/test_spark_construct.py b/test/test_spark_construct.py index a8e43e4..e371c36 100644 --- a/test/test_spark_construct.py +++ b/test/test_spark_construct.py @@ -2,26 +2,26 @@ from numpy import arange from bolt import array, ones, zeros, concatenate from bolt.utils import allclose -from bolt.array.array import BoltArraySpark +from bolt.array.array import BoltArray def test_array(sc): x = arange(2*3*4).reshape((2, 3, 4)) b = array(x, sc) - assert isinstance(b, BoltArraySpark) + assert isinstance(b, BoltArray) assert allclose(x, b.toarray()) b = array(x, sc, axis=0) - assert isinstance(b, BoltArraySpark) + assert isinstance(b, BoltArray) assert allclose(x, b.toarray()) b = array(x, sc, axis=(0, 1)) - assert isinstance(b, BoltArraySpark) + assert isinstance(b, BoltArray) assert allclose(x, b.toarray()) b = array(x, sc, axis=(0, 1), npartitions=5) - assert isinstance(b, BoltArraySpark) + assert isinstance(b, BoltArray) assert allclose(x, b.toarray()) assert b.tordd().getNumPartitions() == 5 diff --git a/test/test_spark_functional.py b/test/test_spark_functional.py index ad778dd..5bc22ee 100644 --- a/test/test_spark_functional.py +++ b/test/test_spark_functional.py @@ -14,11 +14,11 @@ def test_map(sc): # Test all map functionality when the base array is split after the first axis generic.map_suite(x, b) - # Split the BoltArraySpark after the second axis and rerun the tests + # Split the BoltArray after the second axis and rerun the tests b = array(x, sc, axis=(0, 1)) generic.map_suite(x, b) - # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests + # Split the BoltArray after the third axis (scalar values) and rerun the tests b = array(x, sc, axis=(0, 1, 2)) generic.map_suite(x, b) @@ -39,11 +39,11 @@ def test_reduce(sc): # Test all reduce functionality when the base array is split after the first axis generic.reduce_suite(arr, b) - # Split the 
BoltArraySpark after the second axis and rerun the tests + # Split the BoltArray after the second axis and rerun the tests b = array(arr, sc, axis=(0, 1)) generic.reduce_suite(arr, b) - # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests + # Split the BoltArray after the third axis (scalar values) and rerun the tests b = array(arr, sc, axis=(0, 1, 2)) generic.reduce_suite(arr, b) @@ -55,11 +55,11 @@ def test_filter(sc): # Test all filter functionality when the base array is split after the first axis generic.filter_suite(x, b) - # Split the BoltArraySpark after the second axis and rerun the tests + # Split the BoltArray after the second axis and rerun the tests b = array(x, sc, axis=(0, 1)) generic.filter_suite(x, b) - # Split the BoltArraySpark after the third axis (scalar values) and rerun the tests + # Split the BoltArray after the third axis (scalar values) and rerun the tests b = array(x, sc, axis=(0, 1, 2)) generic.filter_suite(x, b) diff --git a/test/test_spark_stacking.py b/test/test_spark_stacking.py index 2e146a9..bf70b91 100644 --- a/test/test_spark_stacking.py +++ b/test/test_spark_stacking.py @@ -2,7 +2,7 @@ from numpy import arange, repeat, asarray, vstack, tile from bolt import array, ones from bolt.utils import allclose -from bolt.array.array import BoltArraySpark +from bolt.array.array import BoltArray def _2D_stackable_preamble(sc, num_partitions=2): @@ -10,7 +10,7 @@ def _2D_stackable_preamble(sc, num_partitions=2): dims = (10, 10) arr = vstack([[x]*dims[1] for x in arange(dims[0])]) barr = array(arr, sc, axis=0) - barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions), + barr = BoltArray(barr._rdd.partitionBy(num_partitions), shape=barr.shape, split=barr.split) return barr @@ -20,7 +20,7 @@ def _3D_stackable_preamble(sc, num_partitions=2): area = dims[0] * dims[1] arr = asarray([repeat(x, area).reshape(dims[0], dims[1]) for x in range(dims[2])]) barr = array(arr, sc, axis=0) - barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions), + barr = BoltArray(barr._rdd.partitionBy(num_partitions), shape=barr.shape, split=barr.split) return barr From 2fd7ff9a1b0aedbc8764ca7a4b3c3ffa746c210c Mon Sep 17 00:00:00 2001 From: jwittenbach Date: Fri, 6 Jan 2017 15:03:08 -0500 Subject: [PATCH 4/4] updates setup.py --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1cee7a9..c973710 100755 --- a/setup.py +++ b/setup.py @@ -11,9 +11,10 @@ author='The Freeman Lab', author_email='the.freeman.lab@gmail.com', url='https://github.com/bolt-project/bolt', - packages=['bolt', - 'bolt.local', - 'bolt.spark'], + packages=[ + 'bolt', + 'bolt.array' + ], long_description=open('README.rst').read(), install_requires=open('requirements.txt').read().split() )
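
Taken together, the stacking changes in this series leave the user-facing flow unchanged apart from the class rename. A minimal sketch of that flow, assuming a live SparkContext `sc`; the block size is arbitrary:

    from numpy import arange
    from bolt import array

    b = array(arange(100).reshape((10, 10)), sc, axis=0)

    s = b.stack(size=2)                   # pack records into per-partition blocks of up to 2 rows
    m = s.map(lambda block: block * 2)    # map over whole blocks instead of single records
    out = m.unstack()                     # back to an ordinary BoltArray

    assert out.shape == (10, 10)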