diff --git a/.gitignore b/.gitignore index 756d371fc..581f2abe5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.a *.o +bin +build lib diff --git a/.travis.yml b/.travis.yml index fc911570b..5b8097897 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,52 +1,123 @@ language: - c -dist: trusty +env: + - C_STANDARD=99 CXX_STANDARD=98 matrix: include: - - compiler: clang-3.6 + - os: linux + dist: trusty + compiler: clang-3.6 addons: apt: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty'] packages: clang-3.6 - - compiler: clang + - os: linux + dist: trusty + compiler: clang addons: apt: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty'] packages: clang-3.4 - - compiler: gcc-4.4 + - os: linux + dist: trusty + compiler: gcc-4.4 addons: apt: sources: ubuntu-toolchain-r-test packages: gcc-4.4 - - compiler: gcc-4.7 + - os: linux + dist: trusty + compiler: gcc-4.7 addons: apt: sources: ubuntu-toolchain-r-test packages: gcc-4.7 - - compiler: gcc-4.8 + - os: linux + dist: trusty + compiler: gcc-4.8 addons: apt: sources: ubuntu-toolchain-r-test packages: gcc-4.8 - - compiler: gcc-4.9 + - os: linux + dist: trusty + compiler: gcc-4.9 addons: apt: sources: ubuntu-toolchain-r-test packages: gcc-4.9 - - compiler: gcc-5 + - os: linux + dist: trusty + compiler: gcc-5 addons: apt: sources: ubuntu-toolchain-r-test packages: gcc-5 + - os: linux + dist: trusty + compiler: gcc-6 + addons: + apt: + sources: ubuntu-toolchain-r-test + packages: gcc-6 + env: + - C_STANDARD=90 + + - os: linux + dist: trusty + compiler: gcc-6 + addons: + apt: + sources: ubuntu-toolchain-r-test + packages: gcc-6 + env: + - C_STANDARD=11 + + - os: linux + dist: trusty + compiler: gcc-6 + addons: + apt: + sources: ubuntu-toolchain-r-test + packages: gcc-6 + env: + - CXX_STANDARD=11 + + - os: linux + dist: trusty + compiler: gcc-6 + addons: + apt: + sources: ubuntu-toolchain-r-test + packages: gcc-6 + env: + - CXX_STANDARD=14 + + - os: linux + dist: trusty + compiler: gcc-6 + addons: + 
apt: + sources: ubuntu-toolchain-r-test + packages: gcc-6 + + - os: osx + osx_image: xcode7.3 + compiler: gcc + + - os: osx + osx_image: xcode8.3 + compiler: gcc + script: - ./travis.sh diff --git a/API b/API deleted file mode 100644 index a4a5e2496..000000000 --- a/API +++ /dev/null @@ -1,443 +0,0 @@ -OVERVIEW - -zfp consists of three distinct components: (1) a set of low-level C codecs -for compressing and decompressing block subsets of one-, two-, and three- -dimensional single- and double-precision arrays; (2) a set of corresponding -C++ compressed array classes that support random access; and (3) a high-level -C interface for compressing and decompressing entire floating-point arrays. - -The compression codecs operate on individual d-dimensional blocks of size -4^d, e.g. 4 values in 1D, 4x4 = 16 values in 2D, and 4x4x4 = 64 values in -3D. The block being compressed need not be stored contiguously but can -be processed by specifying regular strides in each dimension. This is -useful if the block is initially stored uncompressed as part of a larger -array. - -The array classes represent an entire array of floating-point values as -a collection of compressed blocks, each whose compressed size in number -of bits is fixed and specified by the user. The array classes cache -uncompressed blocks to reduce the number of compression and decompression -calls. Whenever an array value is read, the corresponding block is first -looked up in the cache, and if found the uncompressed value is returned. -Otherwise the block is first decompressed and stored in the cache. -Whenever an array element is written (whether actually modified or not), -a "dirty bit" is set with its cached block to indicate that the block -must be compressed back to persistent storage when evicted from the cache. 
- -The libzfp C interface is useful for quickly compressing and archiving -large floating-point arrays of arbitrary dimensions without having to -understand the technical details of the compression algorithm and codec. -This library comes with utility functions for specifying the compression -rate, precision, or accuracy of the compressed data. - -All code examples below are for 3D arrays of doubles, but it should be -clear how to modify the function calls for single precision and for 1D -or 2D arrays. - - -GENERAL DESIGN AND LIMITATIONS - -The zfp API has been designed to facilitate integration with existing -applications. After initial array declaration, a zfp array can often -be used in place of a regular C/C++ array or STL vector, e.g. using flat -indexing via a[index] or using multidimensional indexing via a(i), -a(i, j), or a(i, j, k). There are, however, some important differences. -For instance, it is not possible to take the address of an array element, -i.e. constructions like &a[i] and a + i are not allowed. Moreover, the -operators [] and () do not return regular C++ references. Instead, a -proxy reference class is used (similar to how STL bit vectors are -implemented). These proxy references can, however, safely be passed to -functions and used where regular references can. - -zfp does not support special floating-point values like infinities and -NaNs, although denormalized numbers are handled correctly. Similarly, -because the compressor assumes that the array values vary smoothly, -using finite but large values like HUGE_VAL in place of infinities is -not advised, as this will introduce large errors in smaller values -within the same block. Future extensions will provide support for a -bit mask to mark the presence of non-values. - -The zfp C++ classes are implemented entirely as header files and make -extensive use of C++ templates to reduce code redundancy. Most classes -are wrapped in the 'zfp' namespace. 
- - -API OVERVIEW - -The documentation is divided into three parts: the high-level libzfp -library; the low-level compression codecs; and the compressed array -classes (in that order). Users interested only in the compressed arrays, -which do not directly expose anything related to compression other than -compression rate control, may safely skip the next two sections. - - -ZFP HIGH-LEVEL C INTERFACE - -Users concerned only with storing their floating-point data compressed may -use zfp as a black box that maps a possibly non-contiguous floating-point -array to a compressed bit stream. The intent of libzfp is to provide both -a high- and low-level interface to the compressor that can be called from -both C and C++ (and possibly other languages). libzfp supports strided -access, e.g. for compressing vector fields one scalar at a time, or for -compressing arrays of structs. - -Consider compressing the 3D C/C++ array - - // define an uncompressed array - double a[nz][ny][nx]; - -where nx, ny, and nz can be any positive dimensions. To invoke the libzfp -compressor, the dimensions and type must first be specified in a zfp_field -parameter object that encapsulates the type, size, and memory layout of the -array: - - // allocate metadata for the 3D array a[nz][ny][nx] - uint dims = 3; - zfp_type type = zfp_type_double; - zfp_field* field = zfp_field_3d(&a[0][0][0], type, nx, ny, nz); - -For single-precision data, use zfp_type_float. As of version 0.5.1, the -the high-level API also supports integer arrays (zfp_type_int32 and -zfp_type_int64). See FAQs #8 and #9 regarding integer compression. - -Functions similar to zfp_field_3d exist for declaring 1D and 2D arrays. -If the dimensionality of the array is unknown at this point, then a generic -zfp_field_alloc() call can be made to just allocate a zfp_field struct, -which can be filled in later using the zfp_field_set_* functions. If the -array is non-contiguous, then zfp_field_set_stride_3d should be called. 
- -The zfp_field parameter object holds information about the uncompressed -array. To specify the compressed array, a zfp_stream object must be -allocated: - - // allocate metadata for a compressed stream - zfp_stream* zfp = zfp_stream_open(NULL); - -We may now specify the rate, precision, or accuracy (see the README file -for more details on the meaning of these parameters): - - // set compression mode and parameters - zfp_stream_set_rate(zfp, rate, type, dims, 0); - zfp_stream_set_precision(zfp, precision); - zfp_stream_set_accuracy(zfp, tolerance); - -Note that only one of these three functions should be called. The return -value from these functions gives the actual rate, precision, or tolerance, -and may differ slightly from the argument passed due to constraints imposed -by the compressor, e.g. each block must be stored using a whole number of -bits at least as large as the number of bits in the floating-point exponent; -the precision cannot exceed the number of bits in a floating-point value -(i.e. 32 for single and 64 for double precision); and the tolerance must -be a (possibly negative) power of two. - -The compression parameters have now been specified, but before compression -can occur a buffer large enough to hold the compressed bit stream must be -allocated. Another utility function exists for estimating how many bytes -are needed: - - // allocate buffer for compressed data - size_t bufsize = zfp_stream_maximum_size(zfp, field); - uchar* buffer = new uchar[bufsize]; - -Note that zfp_stream_maximum_size returns the smallest buffer size -necessary to safely compress the data--the actual compressed size may be -smaller. If the members of zfp and field are for whatever reason not -initialized correctly, then zfp_stream_maximum_size returns 0. 
- -Before compression can commence, we must associate the allocated buffer -with a bit stream used by the compressor to read and write bits: - - // associate bit stream with allocated buffer - bitstream* stream = stream_open(buffer, bufsize); - zfp_stream_set_bit_stream(zfp, stream); - -Finally, the array is compressed as follows: - - // compress entire array - size_t size = zfp_compress(zfp, field); - -The return value is the actual number of bytes of compressed storage, and -as already mentioned, size <= bufsize. If size = 0, then the compressor -failed. Since zfp 0.5.0, the compressor does not rewind the bit stream -before compressing, which allows multiple fields to be compressed one -after the other. The return value from zfp_compress is always the total -number of bytes of compressed storage so far relative to the memory -location pointed to by 'buffer'. - -To decompress the data, the field and compression parameters must be -initialized with the same values as used for compression, either via -the same sequence of function calls as above, or by recording these -fields and setting them directly. Metadata such as array dimensions and -compression parameters are by default not stored in the compressed stream. -It is up to the caller to store this information, either separately from -the compressed data, or via the zfp_write_header and zfp_read_header calls, -which must precede the corresponding zfp_compress and zfp_decompress -calls, respectively. These calls allow the user to specify what -information to store in the header, including a 'magic' format identifier, -the field type and dimensions, and the compression parameters (see the -ZFP_HEADER_* macros). 
- -In addition to this initialization, the bit stream has to be rewound to -the beginning (before reading the header and decompressing the data): - - // rewind compressed stream and decompress array - zfp_stream_rewind(zfp); - int success = zfp_decompress(zfp, field); - -The return value is zero if the decompressor failed. - - -ZFP LOW-LEVEL COMPRESSION AND DECOMPRESSION CODEC - -For applications that wish to compress or decompress portions of an array -on demand, a low-level interface is available. Since this API is useful -primarily for supporting random access, the user also needs to manipulate -the bit stream (see include/bitstream.h), e.g. to position the bit pointer -to where data is to be read or written. Please be advised that the bit -stream functions have been optimized for speed, and do not check for -buffer overruns or other types of programmer error. - -Like the high-level API, the low-level API also makes use of the zfp_stream -parameter object (see section above) to specify compression parameters and -storage, but does not encapsulate array metadata in a zfp_field object. -Functions exist for encoding and decoding complete or partial blocks, with -or without strided access. In non-strided mode, the uncompressed block to -be encoded or decoded is assumed to be stored contiguously. For example, - - // compress a single contiguous block - double block[4 * 4 * 4] = { /* some set of values */ }; - uint bits = zfp_encode_block_double_3(zfp, block); - -The return value is the number of bits of compressed storage for the block. -For fixed-rate streams, if random access is desired, then the stream should -also be flushed after each block is encoded: - - // flush any buffered bits - zfp_stream_flush(zfp); - -This flushing should be done only after the last block has been compressed in -fixed-precision and fixed-accuracy mode, or when random access is not needed -in fixed-rate mode. 
- -The block above could also have been compressed as follows using strides: - - // compress a single contiguous block using strides - double block[4][4][4] = { /* some set of values */ }; - int sx = &block[0][0][1] - &block[0][0][0]; // x stride = 1 - int sy = &block[0][1][0] - &block[0][0][0]; // y stride = 4 - int sz = &block[1][0][0] - &block[0][0][0]; // z stride = 16 - uint bits = zfp_encode_block_strided_double_3(zfp, block, sx, sy, sz); - -The strides are measured in number of scalars, not in bytes. - -For partial blocks, e.g. near the boundaries of arrays whose dimensions -are not multiples of four, there are corresponding functions that accept -parameters (nx, ny, nz) to specify the actual block dimensions, with -1 <= nx, ny, nz <= 4. Corresponding functions exist for decompression. -Such partial blocks typically do not compress as well as full blocks and -should be avoided if possible. - -To position a bit stream for reading (decompression), use - - // position the stream at given bit offset for reading - stream_rseek(stream, offset); - -where the offset is measured in number of bits from the beginning of the -stream. For writing (compression), a corresponding call exists: - - // position the stream at given bit offset for writing - stream_wseek(stream, offset); - -Note that it is possible to decompress fewer bits than are stored with a -compressed block to quickly obtain an approximation. This is done by -setting zfp->maxbits to fewer bits than used during compression, e.g. to -decompress only the first 256 bits of each block: - - // modify decompression parameters to decode 256 bits per block - uint maxbits; - uint maxprec; - int minexp; - zfp_stream_params(zfp, NULL, &maxbits, &maxprec, &minexp); - assert(maxbits >= 256); - zfp_stream_set_params(zfp, 256, 256, maxprec, minexp); - -This feature may be combined with progressive decompression, as discussed -further in FAQ #13. 
- - -COMPRESSED ARRAYS - -Currently there are six array classes for 1D, 2D, and 3D arrays, each of -which can represent single- or double-precision values. Although these -arrays store values in a form different from conventional single- and -double-precision floating point, the user interacts with the arrays via -floats and doubles. - -The description below is for 3D arrays of doubles--the necessary changes -for other array types should be obvious. To declare and zero initialize -an array, use - - // declare nx * ny * nz array of compressed doubles - zfp::array3 a(nx, ny, nz, rate); - -This declaration is conceptually equivalent to - - double a[nz][ny][nx] = {}; - -or - - std::vector a(nx * ny * nz, 0.0); - -but with the user specifying the amount of storage used. (A predefined type -array3d also exists, while the suffix 'f' is used for floats.) Note that -the array dimensions can be arbitrary, and need not be multiples of four -(see above for a discussion of incomplete blocks). The 'rate' argument -specifies how many bits per value (amortized) to store in the compressed -representation. By default the block size is restricted to a multiple of -64 bits, and therefore the rate argument can be specified in increments of -64 / 4^d bits in d dimensions, i.e. - - 1D arrays: 16-bit granularity - 2D arrays: 4-bit granularity - 3D arrays: 1-bit granularity - -For finer granularity, the BIT_STREAM_WORD_TYPE macro needs to be set to a -type narrower than 64 bits during compilation of libzfp, e.g. if set to -uint8 the rate granularity becomes 8 / 4^d bits in d dimensions, or - - 1D arrays: 2-bit granularity - 2D arrays: 1/2-bit granularity - 3D arrays: 1/8-bit granularity - -Note that finer granularity implies lower performance. Also note that -because the arrays are stored compressed, their effective precision is -likely to be higher than the user-specified rate. 
- -The array can also optionally be initialized from an existing contiguous -floating-point array stored at 'pointer' with an x stride of 1, y stride -of nx, and z stride of nx * ny: - - // declare and initialize 3D array of doubles - zfp::array3d a(nx, ny, nz, precision, pointer, cache_size); - -The 'cache_size' argument specifies the minimum number of bytes to allocate -for the cache of uncompressed blocks (see the section on Caching below for -more details). - -If not already initialized, a function set() can be used to copy uncompressed -data to the compressed array: - - const double* pointer; // pointer to uncompressed, initialized data - a.set(pointer); // initialize compressed array with floating-point data - -Similarly, a get() function exists for retrieving uncompressed data: - - double* pointer; // pointer to where to write uncompressed data - a.get(pointer); // decompress and store the array at pointer - -The compressed representation of an array can also be queried or initialized -directly without having to convert to/from its floating-point representation: - - size_t bytes = compressed_size(); // number of bytes of compressed storage - uchar* compressed_data(); // pointer to compressed data - -The array can through this pointer be initialized from offline compressed -storage, but only after its dimensions and rate have been specified (see -above). For this to work properly, the cache must first be emptied via a -clear_cache() call (see below). - -Through operator overloading, the array can be accessed in one of two ways. -For read accesses, use - - double value = a[index]; // fetch value with given flat array index - double value = a(i, j, k); // fetch value with 3D index (i, j, k) - -These access the same value if and only if index = i + nx * (j + ny * k). -Note that 0 <= i < nx, 0 <= j < ny, and 0 <= k < nz, and i varies faster -than j, which varies faster than k. 
- -Array values may be written and updated using the usual set of C++ assignment -and compound assignment operators. For example: - - a[index] = value; // set value at flat array index - a(i, j, k) += value; // increment value with 3D index (i, j, k) - -Whereas one might expect these operators to return a (non-const) reference -to an array element, this would allow seating a reference to a value that -currently is cached but is transient, which could be unsafe. Moreover, -this would preclude detecting when an array element is modified. Therefore, -the return type of both operators [] and () is a proxy reference class, -similar to std::vector::reference from the STL library. Because -read accesses to a mutable object cannot call the const-qualified accessor, -a proxy reference may be returned even for read calls, e.g. in - - a[i - 1] = a[i]; - -the array a clearly must be mutable to allow assignment to a[i - 1], and -therefore the read access a[i] returns type zfp::array3d::reference. The -value associated with the read access is obtained via an implicit conversion. - -Array dimensions (nx, ny, nz) can be queried using these functions: - - size_t size(); // total number of elements nx * ny * nz - uint size_x(); // nx - uint size_y(); // ny - uint size_z(); // nz - -The array dimensions can also be changed dynamically, e.g. if not known -at time of construction, using - - void resize(uint nx, uint ny, uint nz, bool clear = true); - -When clear = true, the array is explicitly zeroed. In either case, all -previous contents of the array are lost. If nx = ny = nz = 0, all storage -is freed. - -Finally, the rate supported by the array may be queried via - - double rate(); // number of compressed bits per value - -and changed using - - void set_rate(rate); // change rate - -This also destroys prior contents. - - -CACHING - -As mentioned above, the array class maintains a software write-back cache -of at least one uncompressed block. 
When a block in this cache is evicted -(e.g. due to a conflict), it is compressed back to permanent storage only -if it was modified while stored in the cache. - -The size cache to use is specified by the user, and is an important -parameter that needs careful consideration in order to balance the extra -memory usage, performance, and quality (recall that data loss is incurred -only when a block is evicted from the cache and compressed). Although the -best choice varies from one application to another, we suggest allocating -at least two layers of blocks (2 * (nx / 4) * (ny / 4) blocks) for -applications that stream through the array and perform stencil computations -such as gathering data from neighboring elements. This allows limiting the -cache misses to compulsory ones. If the cache_size parameter is set to -zero bytes, then this default of two layers is used. - -The cache size can be set during construction, or can be set at a later -time via - - void set_cache_size(bytes); // change cache size - -Note that if bytes = 0, then the array dimensions must have already been -specified for the default size to be computed correctly. When the cache -is resized, it is first flushed if not already empty. 
The cache can -also be flushed explicitly if desired by calling - - void flush_cache(); // empty cache by first compressing any modified blocks - -To empty the cache without compressing any cached data, call - - void clear_cache(); // empty cache without compression - -To query the byte size of the cache, use - - size_t cache_size(); // actual cache size in bytes diff --git a/CMakeLists.txt b/CMakeLists.txt index 06668fc8f..1660635b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_MAJOR[ \t]+([0-9]+).*" "\\1" ZFP_VERSION_MAJOR ${_zfp_h_contents}) string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_MINOR[ \t]+([0-9]+).*" "\\1" ZFP_VERSION_MINOR ${_zfp_h_contents}) -string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_RELEASE[ \t]+([0-9]+).*" +string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_PATCH[ \t]+([0-9]+).*" "\\1" ZFP_VERSION_PATCH ${_zfp_h_contents}) set(ZFP_VERSION "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}") @@ -46,8 +46,8 @@ endif() # Top level options #------------------------------------------------------------------------------# -# Windows specific options -if(WIN32) +# Windows (Visual Studio) specific options +if(MSVC) # Use this to get a usable export library when building a DLL on Windows set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) @@ -56,8 +56,21 @@ if(WIN32) endif() # Suggest C99 -# Falls back to C89 if 99 is not supported -set(CMAKE_C_STANDARD 99) +if(NOT CMAKE_C_STANDARD) + set(CMAKE_C_STANDARD 99) +endif() + +if(MSVC OR MINGW) + set(CMAKE_C_STANDARD 90) +endif() + +message(STATUS "Compiling with C standard: ${CMAKE_C_STANDARD}") + +# Suggest C++98 +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 98) +endif() +message(STATUS "Compiling with C++ standard: ${CMAKE_CXX_STANDARD}") include(CMakeDependentOption) @@ -74,7 +87,7 @@ cmake_dependent_option(BUILD_SHARED_LIBS "SHARED_LIBS_SUPPORTED" OFF) # PIC is always on for shared libs. 
This allows it to be selectable for -# static libs +# static libs. if(SHARED_LIBS_SUPPORTED) cmake_dependent_option(ZFP_ENABLE_PIC "Build with Position Independent Code" ON @@ -95,10 +108,46 @@ option(ZFP_WITH_ALIGNED_ALLOC "Enabled aligned memory allocation" OFF) option(ZFP_WITH_CACHE_TWOWAY "Use two-way skew-associative cache" OFF) -option(ZFP_WITH_CACHE_FASH_HASH +option(ZFP_WITH_CACHE_FAST_HASH "Use a faster but more collision prone hash function" OFF) -option(ZFP_WITH_CACHE_PROFILING "Count cache misses" OFF) +option(ZFP_WITH_CACHE_PROFILE "Count cache misses" OFF) + +# Handle compile-time macros + +if((DEFINED ZFP_INT64) AND (DEFINED ZFP_INT64_SUFFIX)) + list(APPEND zfp_defs ZFP_INT64=${ZFP_INT64}) + list(APPEND zfp_defs ZFP_INT64_SUFFIX=${ZFP_INT64_SUFFIX}) +endif() + +if((DEFINED ZFP_UINT64) AND (DEFINED ZFP_UINT64_SUFFIX)) + list(APPEND zfp_defs ZFP_UINT64=${ZFP_UINT64}) + list(APPEND zfp_defs ZFP_UINT64_SUFFIX=${ZFP_UINT64_SUFFIX}) +endif() + +if(NOT (ZFP_BIT_STREAM_WORD_SIZE EQUAL 64)) + list(APPEND zfp_defs BIT_STREAM_WORD_TYPE=uint${ZFP_BIT_STREAM_WORD_SIZE}) +endif() + +if(ZFP_WITH_BIT_STREAM_STRIDED) + list(APPEND zfp_defs BIT_STREAM_STRIDED) +endif() + +if(ZFP_WITH_ALIGNED_ALLOC) + list(APPEND zfp_defs ZFP_ALIGNED_ALLOC) +endif() + +if(ZFP_WITH_CACHE_TWOWAY) + list(APPEND zfp_defs ZFP_CACHE_TWOWAY) +endif() + +if(ZFP_WITH_CACHE_FAST_HASH) + list(APPEND zfp_defs ZFP_CACHE_FAST_HASH) +endif() + +if(ZFP_WITH_CACHE_PROFILE) + list(APPEND zfp_defs ZFP_CACHE_PROFILE) +endif() #------------------------------------------------------------------------------# # Add source code diff --git a/Config b/Config index 1acb27962..c7154c5b4 100644 --- a/Config +++ b/Config @@ -2,13 +2,18 @@ CC = gcc CXX = g++ -FLAGS = -O3 -fPIC -Wall -I../include $(DEFS) +FLAGS = -O3 -fPIC -Wall -pedantic -I../include $(DEFS) # CFLAGS = -std=c89 -Wno-unused-function $(FLAGS) CFLAGS = -std=c99 $(FLAGS) CXXFLAGS = -std=c++98 $(FLAGS) +# CXXFLAGS = -std=c++11 $(FLAGS) # optional compiler 
macros ---------------------------------------------------- +# use long long for 64-bit types +# DEFS += -DZFP_INT64='long long' -DZFP_INT64_SUFFIX='ll' +# DEFS += -DZFP_UINT64='unsigned long long' -DZFP_UINT64_SUFFIX='ull' + # use smaller bit stream word type for finer rate granularity # DEFS += -DBIT_STREAM_WORD_TYPE=uint8 # DEFS += -DBIT_STREAM_WORD_TYPE=uint16 @@ -19,13 +24,13 @@ CXXFLAGS = -std=c++98 $(FLAGS) # DEFS += -DBIT_STREAM_STRIDED # use aligned memory allocation -# DEFS += -DALIGNED_ALLOC +# DEFS += -DZFP_WITH_ALIGNED_ALLOC # use two-way skew-associative cache -# DEFS += -DCACHE_TWOWAY +# DEFS += -DZFP_WITH_CACHE_TWOWAY # use faster but more collision prone hash function -# DEFS += -DCACHE_FAST_HASH +# DEFS += -DZFP_WITH_CACHE_FAST_HASH # count cache misses -# DEFS += -DCACHE_PROFILE +# DEFS += -DZFP_WITH_CACHE_PROFILE diff --git a/FAQ b/FAQ deleted file mode 100644 index 658a1babf..000000000 --- a/FAQ +++ /dev/null @@ -1,624 +0,0 @@ -[NEED TO PROOFREAD FAQ] - -The following is a list of answers to frequently asked questions. For -questions not answered here or elsewhere in the documentation, please -e-mail the author at pl@llnl.gov. - -Questions answered in this FAQ: - - Q1: Can zfp compress vector fields? - Q2: Should I declare a 2D array as zfp::array1d a(nx * ny, rate)? - Q3: How can I initialize a zfp compressed array from disk? - Q4: Can I use zfp to represent dense matrices? - Q5: Can zfp compress logically regular but geometrically irregular data? - Q6: Does zfp handle infinities, NaNs, and denormal floating-point numbers? - Q7: Can zfp handle data with some missing values? - Q8: Can I use zfp to store integer data? - Q9: Can I compress 32-bit integers using zfp? -Q10: Why does zfp corrupt memory if my allocated buffer is too small? -Q11: Are zfp compressed streams portable across platforms? -Q12: How can I achieve finer rate granularity? -Q13: Can I generate progressive zfp streams? -Q14: How do I initialize the decompressor? 
-Q15: Must I use the same parameters during compression and decompression? -Q16: Do strides have to match during compression and decompression? -Q17: Why does zfp sometimes not respect my error tolerance? -Q18. Why is the actual rate sometimes not what I requested? -Q19. Can zfp perform compression in place? -Q20. How should I set the precision to bound the relative error? -Q21. Does zfp support lossless compression? - -------------------------------------------------------------------------------- - -Q1: I have a 2D vector field - - double velocity[ny][nx][2]; - -of dimensions nx * ny. Can I use a 3D zfp array to store this as - - array3d velocity(2, nx, ny, rate); - -A: Although this could be done, zfp assumes that consecutive values are -related. The two velocity components (vx, vy) are almost suredly independent, -and would not exhibit smoothness. This will severely hurt the compression -rate or quality. Instead, consider storing vx and vy as two separate 2D -scalar arrays - - array2d vx(nx, ny, rate); - array2d vy(nx, ny, rate); - -or as - - array2d velocity[2] = {array2d(nx, ny, rate), array2d(nx, ny, rate)}; - -------------------------------------------------------------------------------- - -Q2: I have a 2D scalar field of dimensions nx * ny that I allocate as - - double* a = new double[nx * ny]; - -and index as - - a[x + nx * y] - -Should I use a corresponding zfp array - - array1d a(nx * ny, rate); - -to store my data in compressed form? - -A: Although this is certainly possible, if the scalar field exhibits -coherence in both spatial dimensions, then far better results can be -achieved by using a 2D array - - array2d a(nx, ny, rate); - -Although both compressed arrays can be indexed as above, the 2D array can -exploit smoothness in both dimensions and improve the quality dramatically -for the same rate. 
- -------------------------------------------------------------------------------- - -Q3: I have a large, uncompressed, 3D data set - - double a[nz][ny][nx]; - -stored on disk that I would like to read into a compressed array. This data -set will not fit in memory uncompressed. What is the best way of doing this? - -A: Using a zfp array - - array3d a(nx, ny, nz, rate); - -the most straightforward way is to read one floating-point value at a time -and copy it into the array: - - for (uint z = 0; z < nz; z++) - for (uint y = 0; y < ny; y++) - for (uint x = 0; x < nx; x++) { - double f; - if (fread(&f, sizeof(f), 1, file) == 1) - a(x, y, z) = f; - else - // handle I/O error - } - -Note, however, that if the array cache is not large enough, then this may -compress blocks before they have been completely filled. Therefore it is -recommended that the cache holds at least one complete layer of blocks, -i.e. (nx / 4) * (ny / 4) blocks in the example above. - -To avoid inadvertent evictions of partially initialized blocks, it is better -to buffer four layers of nx * ny values each at a time, when practical, and -completely initialize one block after another. - -------------------------------------------------------------------------------- - -Q4: Can I use zfp to represent dense matrices? - -A: Yes, but your mileage may vary. Dense matrices, unlike smooth scalar -fields, rarely exhibit correlation between adjacent rows and columns. Thus, -the quality or compression ratio may suffer. - -------------------------------------------------------------------------------- - -Q5: My data is logically structured but irregularly sampled, e.g. it is -rectilinear, curvilinear, or Lagrangian, or uses an irregular spacing of -quadrature points. Can I still use zfp to compress it? - -A: Yes, as long as the data is (or can be) represented as a logical -multidimensional array, though your mileage may vary. 
zfp has been designed -for uniformly sampled data, and compression will in general suffer the more -irregular the sampling is. - -------------------------------------------------------------------------------- - -Q6: Does zfp handle infinities and NaNs? What about denormal floating-point -numbers? - -A: No, only finite, valid floating-point values are supported. If a block -contains a NaN or an infinity, undefined behavior is invoked due to the -C math function frexp being undefined for non-numbers. Denorms are, however, -handled correctly. - -------------------------------------------------------------------------------- - -Q7: My data has some missing values that are flagged by very large numbers, -e.g. 1e30. Is that OK? - -A: Although all finite numbers are "correctly" handled, such large sentinel -values are likely to pollute nearby values, because all values within a block -are expressed with respect to a common largest exponent. The presence of -very large values may result in complete loss of precision of nearby, valid -numbers. Currently no solution to this problem is available, but future -versions of zfp will likely support a bit mask to tag values that should be -excluded from compression. - -------------------------------------------------------------------------------- - -Q8: Can I use zfp to store integer data such as 8-bit quantized images or -16-bit digital elevation models? - -A: Yes (as of version 0.4.0), but the data has to be promoted to 32-bit signed -integers first. This should be done one block at a time using an appropriate -zfp_promote_*_to_int32 function call (see zfp.h). Note that these functions -shift the low-precision integers into the most significant bits of 31-bit (not -32-bit) integers and also convert unsigned to signed integers. Do use these -functions rather than simply casting 8-bit integers to 32 bits to avoid wasting -compressed bits to encode leading zeros. 
Moreover, in fixed-precision mode, -set the precision relative to the precision of the (unpromoted) source data. - -As of version 0.5.1, integer data is supported both by the low-level API and -high-level calls zfp_compress and zfp_decompress. - -------------------------------------------------------------------------------- - -Q9: I have some 32-bit integer data. Can I compress it using zfp's 32-bit -integer support? - -A: Maybe. zfp compression of 32-bit and 64-bit integers requires that each -integer f have magnitude |f| < 2^30 and |f| < 2^62, respectively. To handle -signed integers that span the entire range -2^31 <= x < 2^31, or unsigned -integers 0 <= x < 2^32, the data has to be promoted to 64 bits first. - -As with floating-point data, the integers should ideally represent a -quantized continuous function rather than, say, categorical data or set of -indices. Depending on compression settings and data range, the integers may -or may not be losslessly compressed. If fixed-precision mode is used, the -integers may be stored at less precision than requested. See Q21 for more -details on precision and lossless compression. - -------------------------------------------------------------------------------- - -Q10: Why does zfp corrupt memory rather than return an error code if not enough -memory is allocated for the compressed data? - -A: This is for performance reasons. zfp was primarily designed for fast -random access to fixed-rate compressed arrays, where checking for buffer -overruns is unnecessary. Adding a test for every compressed byte output -would significantly compromise performance. - -One way around this problem (when not in fixed-rate mode) is to use the -maxbits parameter in conjunction with the maximum precision or maximum -absolute error parameters to limit the size of compressed blocks. 
Finally, -the function zfp_stream_maximum_size returns a conservative buffer size -that is guaranteed to be large enough to hold the compressed data and the -optional header. - -------------------------------------------------------------------------------- - -Q11: Are zfp compressed streams portable across platforms? Are there, for -example, endianness issues? - -A: Yes, zfp can write portable compressed streams. To ensure portability -across different endian platforms, the bit stream must however be written -in increments of single bytes on big endian processors (e.g. PowerPC, SPARC), -which is achieved by compiling zfp with an 8-bit (single-byte) word size: - - -DBIT_STREAM_WORD_TYPE=uint8 - -See the Config file. Note that on little endian processors (e.g. Intel -x86-64 and AMD64), the word size does not affect the bit stream produced, -and thus the default word size may be used. By default, zfp uses a word -size of 64 bits, which results in the coarsest rate granularity but fastest -(de)compression. If cross-platform portability is not needed, then the -maximum word size is recommended (but see also Q12). - -When using 8-bit words, zfp produces a compressed stream that is byte order -independent, i.e. the exact same compressed sequence of bytes is generated -on little and big endian platforms. When decompressing such streams, -floating-point and integer values are recovered in the native byte order of -the machine performing decompression. The decompressed values can be used -immediately without the need for byte swapping and without having to worry -about the byte order of the computer that generated the compressed stream. - -Finally, zfp assumes that the floating-point format conforms to IEEE 754. -Issues may arise on architectures that do not support IEEE floating point. - -------------------------------------------------------------------------------- - -Q12: How can I achieve finer rate granularity? 
- -A: For d-dimensional arrays, zfp supports a granularity of 8 / 4^d bits, i.e. -the rate can be specified in increments of a fraction of a bit for 2D and 3D -arrays. Such fine rate selection is always available for sequential -compression (e.g. when calling zfp_compress). - -Unlike in sequential compression, zfp's compressed arrays require random -access writes, which are supported only at the granularity of whole words. -By default, a word is 64 bits, which gives a rate granularity of 64 / 4^d -in d dimensions, i.e. 16 bits in 1D, 4 bits in 2D, and 1 bit in 3D. - -To achieve finer granularity, recompile zfp with a smaller (but as large as -possible) stream word size, e.g. - - -DBIT_STREAM_WORD_TYPE=uint8 - -gives the finest possible granularity, but at the expense of (de)compression -speed. See the Config file. - -------------------------------------------------------------------------------- - -Q13: Can I generate progressive zfp streams? - -A: Yes, but it requires some coding effort. There is no high-level support -for progressive zfp streams. To implement progressive fixed-rate streams, -the fixed-length bit streams should be interleaved among the blocks that -make up an array. For instance, if a 3D array uses 1024 bits per block, -then those 1024 bits could be broken down into, say, 16 pieces of 64 bits -each, resulting in 16 discrete quality settings. By storing the blocks -interleaved such that the first 64 bits of all blocks are contiguous, -followed by the next 64 bits of all blocks, etc., one can achieve progressive -decompression by setting the maxbits parameter (see zfp_stream_set_params) -to the number of bits per block received so far. - -To enable interleaving of blocks, zfp must first be compiled with - - -DBIT_STREAM_STRIDED - -to enable strided bit stream access. 
In the example above, if the stream -word size is 64 bits and there are n blocks, then - - stream_set_stride(stream, m, n); - -implies that after every m 64-bit words have been decoded, the bit stream -is advanced by m * n words to the next set of m 64-bit words associated -with the block. - -------------------------------------------------------------------------------- - -Q14: How do I initialize the decompressor? - -A: The zfp_stream and zfp_field objects usually need to be initialized with -the same values as they had during compression (but see Q15 for exceptions). -These objects hold the compression mode and parameters, and field data like -the scalar type and dimensions. By default, these parameters are not stored -with the compressed stream (the "codestream") and prior to zfp 0.5.0 had to -be maintained separately by the application. - -Since version 0.5.0, functions exist for reading and writing a 12- to 19-byte -header that encodes compression and field parameters. For applications that -wish to embed only the compression parameters, e.g. when the field dimensions -are already known, there are separate functions that encode and decode this -information independently. - -------------------------------------------------------------------------------- - -Q15: Must I use the same parameters during compression and decompression? - -A: Not necessarily. It is possible to use more tightly constrained zfp_stream -parameters during decompression than were used during compression. For -instance, one may use a larger minbits, smaller maxbits, smaller maxprec, or -larger minexp during decompression to process fewer compressed bits than are -stored, and to decompress the array more quickly at a lower precision. This -may be useful in situations where the precision and accuracy requirements are -not known a priori, thus forcing conservative settings during compression, or -when the compressed stream is used for multiple purposes. 
For instance, -visualization usually has less stringent precision requirements than -quantitative data analysis. This feature of decompressing to a lower precision -is particularly useful when the stream is stored progressively (see Q13). - -Note that one may not use less constrained parameters during decompression, -e.g. one cannot ask for more than maxprec bits of precision when decompressing. - -Currently float arrays have a different compressed representation from -compressed double arrays due to differences in exponent width. It is not -possible to compress a double array and then decompress (demote) the result -to floats, for instance. Future versions of the zfp codec may use a unified -representation that does allow this. - -------------------------------------------------------------------------------- - -Q16: Do strides have to match during compression and decompression? - -A: No. For instance, a 2D vector field - - float in[ny][nx][2]; - -could be compressed as two scalar fields with strides sx = 2, sy = 2 * nx, -and with pointers &in[0][0][0] and &in[0][0][1] to the first value of each -scalar field. These two scalar fields can later be decompressed as -non-interleaved fields - - float out[2][ny][nx]; - -using strides sx = 1, sy = nx and pointers &out[0][0][0] and &out[1][0][0]. - -------------------------------------------------------------------------------- - -Q17: Why does zfp sometimes not respect my error tolerance? - -A: zfp does not store each floating-point value independently, but represents -a group of values (4, 16, or 64 values, depending on dimensionality) as linear -combinations like averages by evaluating arithmetic expressions. Just like in -uncompressed IEEE floating-point arithmetic, both representation error and -roundoff error in the least significant bit(s) often occur. 
- -To illustrate this, consider compressing the following 1D array of four floats - - float f[4] = { 1, 1e-1, 1e-2, 1e-3 }; - -using the zfp command-line tool: - - zfp -f -1 4 -a 0 -i input.dat -o output.dat - -In spite of an error tolerance of zero, the reconstructed values are - - float g[4] = { 1, 1e-1, 9.999998e-03, 9.999946e-04 }; - -with a (computed) maximum error of 5.472e-9. Because f[3] = 1e-3 can only -be approximately represented in radix-2 floating-point, the actual error -is even smaller: 5.424e-9. This reconstruction error is primarily due to zfp's -block-floating-point representation, which expresses the four values relative -to a single, common binary exponent. Such exponent alignment occurs also in -regular IEEE floating-point operations like addition. For instance, - - float x = (f[0] + f[3]) - 1; - -should of course result in x = f[3] = 1e-3, but due to exponent alignment -a few of the least significant bits of f[3] are lost in the addition, giving -a result of x = 1.0000467e-3 and a roundoff error of 4.668e-8. Similarly, - - float sum = f[0] + f[1] + f[2] + f[3]; - -should return sum = 1.111, but is computed as 1.1110000610. Moreover, the -value 1.111 cannot even be represented exactly in (radix-2) floating-point; -the closest float is 1.1109999. Thus the computed error - - float error = sum - 1.111f; - -which itself has some roundoff error, is 1.192e-7. - -Phew! Note how the error introduced by zfp (5.472e-9) is in fact one to two -orders of magnitude smaller than the roundoff errors (4.668e-8 and 1.192e-7) -introduced by IEEE floating-point in these computations. This lower error -is in part due to zfp's use of 30-bit significands compared to IEEE's 24-bit -single-precision significands. Note that data sets with a large dynamic -range, e.g. where adjacent values differ a lot in magnitude, are more -susceptible to representation errors. 
- -The moral of the story is that error tolerances smaller than machine epsilon -(relative to the data range) cannot always be satisfied by zfp. Nor are such -tolerances necessarily meaningful for representing floating-point data that -originated in floating-point arithmetic expressions, since accumulated -roundoff errors are likely to swamp compression errors. Because such roundoff -errors occur frequently in floating-point arithmetic, insisting on lossless -compression on the grounds of accuracy is tenuous at best. - -------------------------------------------------------------------------------- - -Q18. Why is the actual rate sometimes not what I requested? - -A: In principle, zfp allows specifying the size of a compressed block in -increments of single bits, thus allowing very fine-grained tuning of the -bit rate. There are, however, cases when the desired rate does not exactly -agree with the effective rate, and users are encouraged to check the return -value of zfp_stream_set_rate, which gives the actual rate. - -There are several reasons why the requested rate may not be honored. First, -the rate is specified in bits/value, while zfp always represents a block -of 4^d values in d dimensions, i.e. using N = 4^d * rate bits. N must be -an integer number of bits, which constrains the actual rate to be a multiple -of 1 / 4^d. The actual rate is computed by rounding 4^d times the desired -rate. - -Second, if the array dimensions are not multiples of four, then zfp pads the -dimensions to the next higher multiple of four. Thus, the total number of -bits for a 2D array of dimensions nx * ny is computed in terms of the number -of blocks bx * by: - - bitsize = (4 * bx) * (4 * by) * rate - -where nx <= 4 * bx < nx + 4 and ny <= 4 * by < ny + 4. When amortizing -bitsize over the nx * ny values, a slightly higher rate than requested may -result. 
- -Third, to support updating compressed blocks, as is needed by zfp's -compressed array classes, the user may request write random access to the -fixed-rate stream. To support this, each block must be aligned on a stream -word boundary (see Q12), and therefore the rate when write random access -is requested must be a multiple of wordsize / 4^d bits. By default -wordsize = 64 bits. - -Fourth, for floating-point data, each block must hold at least the common -exponent and one additional bit, which places a lower bound on the rate. - -Finally, the user may optionally include a header with each array. Although -the header is small, it must be accounted for in the rate. The function -zfp_stream_maximum_size conservatively includes space for a header, for -instance. - -------------------------------------------------------------------------------- - -Q19. Can zfp perform compression in place? - -Because the compressed data tends to be far smaller than the uncompressed -data, it is natural to ask if the compressed stream can overwrite the -uncompressed array to avoid having to allocate separate storage for the -compressed stream. zfp does allow for the possibility of such in-place -compression, but with several caveats and restrictions: - -1. A bitstream must be created whose buffer points to the beginning of -uncompressed (and to be compressed) storage. - -2. The array must be compressed using zfp's low-level API. In particular, -the data must already be partitioned and organized into contiguous blocks -so that all values of a block can be pulled out once and then replaced -with the corresponding shorter compressed representation. - -3. No one compressed block can occupy more space than its corresponding -uncompressed block so that the not-yet compressed data is not overwritten. -This is usually easily accomplished in fixed-rate mode, although the expert -interface also allows guarding against this in all modes using the maxbits -parameter. 
This parameter should be set to maxbits = 4^d * 8 * sizeof(type), -where d is the array dimensionality (1, 2, or 3) and where 'type' is the -scalar type of the uncompressed data. - -4. No header information may be stored in the compressed stream. - -In-place decompression can also be achieved, but in addition to the above -constraints requires even more care: - -1. The data must be decompressed in reverse block order, so that the last -block is decompressed first to the end of the block array. This requires -the user to maintain a pointer to uncompressed storage and to seek via -stream_rseek to the proper location in the compressed stream where the -block is stored. - -2. The space allocated to the compressed stream must be large enough to -also hold the uncompressed data. - -An example is provided that shows how in-place compression can be done. - -------------------------------------------------------------------------------- - -Q20. How should I set the precision to bound the relative error? - -In general, zfp cannot bound the point-wise relative error due to its use of -a block-floating-point representation, in which all values within a block -are represented in relation to a single common exponent. For a high enough -dynamic range within a block there may simply not be enough precision -available to guard against loss. For instance, a block containing the values -2^0 = 1 and 2^-n would require a precision of n + 3 bits to represent -losslessly, and zfp uses at most 64-bit integers to represent values. Thus, -if n >= 62, then 2^-n is replaced with 0, which is a 100% relative error. -Note that such loss also occurs when, for instance, 2^0 and 2^-n are added -using floating-point arithmetic (see also Q17). - -It is, however, possible to bound the error relative to the largest (in -magnitude) value, fmax, within a block, which if the magnitude of values -does not change too rapidly may serve as a reasonable proxy for point-wise -relative errors. 
- -One might then ask if using zfp's fixed-precision mode with p bits of -precision ensures that the block-wise relative error is at most 2^-p * fmax. -This is, unfortunately, not the case, because the requested precision, p, -is ensured only for the transform coefficients. During the inverse transform -of these quantized coefficients the quantization error may amplify. That -being said, it is possible to derive a bound on the error in terms of p that -would allow choosing an appropriate precision. Such a bound is derived below. - -Let - - emax = floor(log2(fmax)) - -be the largest base-2 exponent within a block. For transform coefficient -precision, p, one can show that the maximum absolute error, err, is bounded by - - err <= k(d) * (2^emax / 2^p) <= k(d) * (fmax / 2^p) - -Here k(d) is a constant that depends on the data dimensionality d: - - k(d) = 20 * (15/4)^(d-1) - -so that in 1D, 2D, and 3D we have - - k(1) = 20 - k(2) = 125 - k(3) = 1125/4 - -Thus, to guarantee n bits of accuracy in the decompressed data, we need -to choose a higher precision, p, for the transform coefficients: - - p(n, d) = n + ceil(log2(k(d))) = n + 2 * d + 3 - -so that - - p(n, 1) = n + 5 - p(n, 2) = n + 7 - p(n, 3) = n + 9 - -This p value should be used in the call to zfp_stream_set_precision. - -Note, again, that some values in the block may have leading zeros when -expressed relative to 2^emax, and these leading zeros are counted toward -the n-bit precision. Using decimal to illustrate this, suppose we used -4-digit precision for a 1D block containing these four values: - - -1.41421e+1 ~ -1.414e+1 = -1414 * (10^1 / 1000) - +2.71828e-1 ~ +0.027e+1 = +27 * (10^1 / 1000) - +3.14159e-6 ~ +0.000e+1 = 0 * (10^1 / 1000) - +1.00000e+0 ~ +0.100e+1 = +100 * (10^1 / 1000) - -with the values in the middle column aligned to the common base-10 exponent -+1, and with the values on the right expressed as scaled integers. 
These -are all represented using four digits of precision, but some of those -digits are leading zeros. - -------------------------------------------------------------------------------- - -Q21. Does zfp support lossless compression? - -Yes, and no. For integer data, zfp can with few exceptions ensure lossless -compression. For a given n-bit integer type (n = 32 or n = 64), consider -compressing p-bit signed integer data, with the sign bit counting toward the -precision. In other words, there are exactly 2^p possible signed integers. -If the integers are unsigned, then subtract 2^(p-1) first so that they range -from -2^(p-1) to 2^(p-1) - 1. - -Lossless compression is achieved by first promoting the p-bit integers to -n - 1 bits (see Q8) such that all integer values fall in [-2^30, +2^30), -when n = 32, or in [-2^62, +2^62), when n = 64. In other words, the p-bit -integers first need to be shifted left by n - p - 1 bits. After promotion, -the data should be compressed in zfp's fixed-precision mode using - - q = p + 4 * d + 1 - -bits of precision to ensure no loss, where d is the data dimensionality -(1 <= d <= 3). Consequently, the p-bit data can be losslessly compressed -as long as p <= n - 4 * d - 1. The table below lists the maximum precision -p that can be losslessly compressed using 32- and 64-bit integer types. - - d n=32 n=64 - 1 27 59 - 2 23 55 - 3 19 51 - -Although lossless compression is possible as long as the precision constraint -is met, the precision needed to guarantee no loss is generally much higher -than the precision intrinsic in the uncompressed data, making lossless -compression via zfp not competitive with compressors designed for lossless -compression. Lossy integer compression with zfp can, on the other hand, work -fairly well by using fewer than q bits of precision. - -Furthermore, the minimum precision, q, given above is often larger than what -is necessary in practice. 
There are worst-case inputs that do require such -large q values, but they are quite rare. - -The reason for expanded precision, i.e., why q > p, is that zfp's decorrelating -transform computes averages of integers, and this transform is applied d times -in d dimensions. Each average of two p-bit numbers requires p + 1 bits to -avoid loss, and each transform can be thought of involving up to four such -averaging operations. - -For floating-point data, fully lossless compression with zfp is unlikely, -albeit possible. If the dynamic range is low or varies slowly such that values -within a 4^d block have the same or similar exponent, then the precision gained -by discarding the 8 or 11 bis of the common floating-point exponents can offset -the precision lost in the decorrelating transform. For instance, if all values -in a block have the same exponent, then lossless compression is obtained using -q = 26 + 4 * d <= 32 bits of precision for single-precision data and -q = 55 + 4 * d <= 64 bits of precision for double-precision data. Of course, -the constraint imposed by the available integer precision n implies that -lossless compression of such data is possible only in 1D for single-precision -data and only in 1D and 2D for double-precision data. diff --git a/ISSUES b/ISSUES deleted file mode 100644 index 28f11cfbb..000000000 --- a/ISSUES +++ /dev/null @@ -1,332 +0,0 @@ -This document is intended for trouble shooting problems with zfp, in case -any arise, and primarily focuses on how to correctly make use of zfp. If -the decompressed data looks nothing like the original data, or if the -compression ratios obtained seem not so impressive, then it is very likely -that array dimensions or compression parameters have not been set correctly, -in which case this trouble shooting guide could help. - -The problems addressed by this document include: - - P1: Is the data dimensionality correct? - P2: Have the "smooth" dimensions been identified? 
- P3: Are the array dimensions correct? - P4: Are the array dimensions large enough? - P5: Is the data logically structured? - P6: Is the data set embedded in a regular grid? - P7: Is the data provided to the zfp executable a raw binary array? - P8: Is the byte order correct? - P9: Is the floating-point precision correct? -P10: Is the integer precision correct? -P11: Is the data provided to the zfp executable a raw binary array? -P12: Has the appropriate compression mode been set? - -------------------------------------------------------------------------------- - -P1: Is the data dimensionality correct? - -This is one of the most common problems. First, make sure that zfp is given -the correct dimensionality of the data. For instance, an audio stream is a -1D array, an image is a 2D array, and a volume grid is a 3D array. Sometimes -a data set is a discrete collection of lower-dimensional objects. For -instance, a stack of unrelated images (of the same size) could be represented -in C as a 3D array - - imstack[count][ny][nx] - -but since in this case the images are unrelated, no correlation would be -expected along the third dimension--the underlying dimensionality of the data -is in this case two. In this case, the images could be compressed one at a -time, or they could be compressed together by treating the array dimensions -as - - imstack[count * ny][nx] - -Note that zfp partitions d-dimensional arrays into blocks of 4^d values. -If above ny is not a multiple of four, then some blocks of 4x4 pixels will -contain pixels from different images, which could hurt compression and/or -quality. Still, this way of creating a single image by stacking multiple -images is far preferable over linearizing each image into a 1D signal, and -then compressing the images as - - imstack[count][ny * nx] - -This loses the correlation along the y dimension, and further introduces -discontinuities unless nx is a multiple of four. 
- -Similarly to the example above, a 2D vector field - - vfield[ny][nx][2] - -could be declared as a 3D array, but the x- and y-components of the 2D -vectors are likely entirely unrelated. In this case, each component -needs to be compressed independently, either by rearranging the data -as two scalar fields - - vfield[2][ny][nx] - -or by using strides (see also FAQ #1). Note that in all these cases zfp will -still compress the data, but if the dimensionality is not correct then the -compression ratio will suffer. - -------------------------------------------------------------------------------- - -P2: Have the "smooth" dimensions been identified? - -Closely related to Problem 1 above, some fields simply do not vary smoothly -along all dimensions, and zfp can do a good job compressing only those -dimensions that exhibit some coherence. For instance, consider a table of -stock prices indexed by date and stock: - - price[stocks][dates] - -One could be tempted to compress this as a 2D array, but there is likely -little to no correlation in prices between different stocks. Each such -time series should be compressed independently as a 1D signal. - -What about time-varying images like a video sequence? In this case, it is -likely that there is correlation over time, and that the value of a single -pixel varies smoothly in time. It is also likely that each image exhibits -smoothness along its two spatial dimensions. So this can be treated as a -single, 3D data set. - -How about time-varying volumes, such as - - field[nt][nz][ny][nx] - -zfp currently supports only 1D, 2D, and 3D arrays, whereas a time-varying -volume is 4D. Here the data should ideally be organized by the three -"smoothest" dimensions. 
Given the organization above, this could be -compressed as a 3D array - - field[nt * nz][ny][nx] - -Again, do not compress this as a 3D array - - field[nt][nz][ny * nx] - -------------------------------------------------------------------------------- - -P3: Are the array dimensions correct? - -This is another common problem that seems obvious, but surprisingly users often -get this wrong. Assuming that the smooth dimensions have been identified, it -is important that the dimensions are not transposed. For instance, if the data -is organized as - - field[d1][d2][d3] - -then the data is organized in memory (or on disk) with the d3 dimension varies -fastest, and hence nx = d3, ny = d2, nz = d1 using the zfp naming conventions -for the dimensions, e.g. the zfp executable should be invoked with - - zfp -3 d3 d2 d1 - -in this case. Things will go horribly wrong if zfp in this case is called with -nx = d1, ny = d2, nz = d3. The entire data set will still compress and -decompress, but compression ratio and quality will suffer greatly. - -------------------------------------------------------------------------------- - -P4: Are the array dimensions large enough? - -zfp partitions d-dimensional data sets into blocks of 4^d values, e.g. in 3D -a block consists of 4x4x4 values. If the dimensions are not multiples of -four, then zfp will "pad" the array to the next larger multiple of four. Such -padding can hurt compression. In particular, if one or more of the array -dimensions are small, then the overhead of such padding could be significant. - -Consider compressing a collection of 1000 small 3D arrays - - field[1000][5][14][2] - -zfp would first logically pad this to a larger array - - field[1000][8][16][4] - -which is (8*16*4) / (5*14*2) ~ 3.66 times larger. Although such padding often -compresses well, this still represents a large overhead. - -If a large array has been partitioned into smaller pieces, it may be best to -reassemble the larger array. 
Or, when possible, ensure that the sub-arrays -have dimensions that are multiples of four. - -------------------------------------------------------------------------------- - -P5: Is the data logically structured? - -zfp was designed for logically structured data, i.e. Cartesian grids. It works -much like an image compressor does, which assumes that the data set is a -structured array of pixels, and it assumes that values vary reasonably smoothly -on average, just like natural images tend to contain large regions of uniform -color or smooth color gradients, like a blue sky, smoothly varying skin tones -of a human's face, etc. Many data sets are not represented on a regular grid. -For instance, an array of particle xyz positions - - points[count][3] - -is a 2D array, but does not vary smoothly in either dimension. Furthermore, -such unstructured data sets need not be organized in any particular order; -the particles could be listed in any arbitrary order. One could attempt to -sort the particles, for example by the x coordinate, to promote smoothness, -but this would still leave the other two dimensions non-smooth. - -Sometimes the underlying dimensions are not even known, and only the total -number of floating-point values is known. For example, suppose we only knew -that the data set contained n = count * 3 values. One might be tempted to -compress this using zfp's 1-dimensional compressor, but once again this would -not work well. Such abuse of zfp is much akin to trying to compress an image -using an audio compressor like mp3, or like compressing an n-sample piece of -music as an n-by-one sized image using an image compressor like JPEG. The -results would likely not be very good. - -Some data sets are logically structured but geometrically irregular. Examples -include fields stored on Lagrangian meshes that have been warped, or on -spectral element grids, which use a non-uniform grid spacing. 
zfp assumes -that the data has been regularly sampled in each dimension, and the more the -geometry of the sampling deviates from uniform, the worse compression gets. -Note that rectilinear grids with different but uniform grid spacing in each -dimension are fine. If your application uses very non-uniform sampling, then -resampling onto a uniform grid (if possible) may be advisable. - -Other data sets are "block structured" and consist of piecewise structured -grids that are "glued" together. Rather than treating such data as -unstructured 1D streams, consider partitioning the data set into independent -(possibly overlapping) regular grids. - -------------------------------------------------------------------------------- - -P6: Is the data set embedded in a regular grid? - -Some applications represent irregular geometry on a Cartesian grid, and leave -portions of the domain unspecified. Consider, for instance, sampling the -density of the Earth onto a Cartesian grid. Here the density for grid points -outside the Earth is unspecified. - -In this case, zfp does best by initializing the "background field" to all -zeros. In zfp's fixed-accuracy mode (see the README file), any "empty" blocks -that consist of all zeros are represented using a single bit, and therefore -the overhead of representing empty space can be kept low. - -------------------------------------------------------------------------------- - -P7: Have fill values, NaNs, and infinities been removed? - -It is common to signal unspecified values using what is commonly called a -"fill value," which is a special constant value that tends to be far out of -range of normal values. For instance, in climate modeling the ocean -temperature over land is meaningless, and it is common to use a very large -temperature value such as 1e30 to signal that the temperature is undefined -for such grid points. 
- -Very large fill values do not play well with zfp, because they both introduce -artificial discontinuities and pollute nearby values by expressing them all -with respect to the common largest exponent within their block. Assuming -a fill value of 1e20, the value pi in the same block would be represented as - - 0.0000000000000000000314159... * 1e20 - -Given finite precision, the small fraction would likely be replaced with zero, -resulting in complete loss of the actual value being stored. - -Other applications use NaNs (special not-a-number values) or infinities as -fill values. These are even more problematic, because they do not have a -defined exponent. zfp relies on the C function frexp() to compute the -exponent of the largest (in magnitude) value within a block, but produces -unspecified behavior if that value is not finite. - -zfp currently has no independent mechanism for handling fill values. Ideally -such special values would be signalled separately, e.g. using a bit mask, -and then replaced with zeros to ensure that they both compress well and do -not pollute actual data. - -------------------------------------------------------------------------------- - -P8: Is the byte order correct? - -zfp generally works with the native byte order (e.g. little or big endian) of -the machine it is compiled on. One needs only be concerned with byte order -when reading raw, binary data into the zfp executable, when exchanging -compressed files across platforms, and when varying the bit stream word size -on big endian machines (not common). For instance, to compress a binary -double-precision floating-point file stored in big endian byte order on a -little endian machine, byte swapping must first be done. For example, on -Linux and OS X, 8-byte doubles can be byte swapped using - - objcopy -I binary -O binary --reverse-bytes=8 big.bin little.bin - -See also FAQ #11 for more discussion of byte order. 
- -------------------------------------------------------------------------------- - -P9: Is the floating-point precision correct? - -Another obvious problem: Please make sure that zfp is told whether the data to -compress is an array of single- (32-bit) or double-precision (64-bit) values, -e.g. by specifying the -f or -d options to the zfp executable or by passing -the appropriate zfp_type to the C functions. - -------------------------------------------------------------------------------- - -P10: Is the integer precision correct? - -zfp currently supports compression of 31- or 63-bit signed integers. Shorter -integers (e.g., bytes, shorts) can be compressed but must first be promoted -to one of the longer types. This should always be done using zfp's functions -for promotion and demotion (see zfp.h), which both perform bit shifting and -biasing to handle both signed and unsigned types. It is not sufficient to -simply cast short integers to longer integers. See also FAQs #8 and #9. - -------------------------------------------------------------------------------- - -P11: Is the data provided to the zfp executable a raw binary array? - -zfp expects that the input file is a raw binary array of integers or -floating-point values in the IEEE format. Do not hand zfp a text file -containing ASCII floating-point numbers. Strip the file of any header -information. Languages like Fortran tend to store with the array its size. -No such metadata may be embedded in the file. - -------------------------------------------------------------------------------- - -P12: Has the appropriate compression mode been set? - -zfp provides three different modes of compression that trade storage and -accuracy. In fixed-rate mode, the user specifies the exact number of bits -(often in increments of a fraction of a bit) of compressed storage per value -(but see FAQ #18 for caveats). 
From the user's perspective, this seems a very -desirable feature, since it provides for a direct mechanism for specifying -how much storage to use. However, there is often a large quality penalty -associated with the fixed-rate mode, because each block of 4^d values is -allocated the same number of bits. In practice, the information content over -the data set varies significantly, which means that easy-to-compress regions -are assigned too many bits, while too few bits are available to faithfully -represent the more challenging-to-compress regions. Although one of the unique -features of zfp, its fixed-rate mode should primarily be used only when -random access to the data is needed. - -zfp also provides a fixed-precision mode, where the user specifies how many -uncompressed significant bits to use to represent the floating-point fraction. -This precision may not be exactly what people might normally think of. For -instance, the C float type is commonly referred to as 32-bit precision. -However, the sign bit and exponent account for nine of those bits, and do -not contribute to the number of significant bits of precision. Furthermore, -for normal numbers, IEEE uses a hidden implicit one bit, so most float values -actually have 24 bits of precision. Furthermore, zfp uses a block-floating- -point representation that uses a single exponent per block, which may cause -some small values to have several leading zero bits, and therefore less -precision than requested. Thus, the effective precision returned by zfp in -its fixed-precision mode may in fact vary. In practice, the precision -requested is only an upper bound, though typically at least one value within -a block has the requested precision. - -Finally, zfp supports a fixed-accuracy mode, which except in rare circumstances -(see FAQ #17) ensures that the absolute error is bounded, i.e. 
the difference -between any decompressed and original value is at most the tolerance specified -by the user (but usually several times smaller). Whenever possible, we -recommend using this compression mode, which depending on how easy the data is -to compress results in the smallest compressed stream that respects the error -tolerance. - -There is also an expert mode that allows the user to combine the constraints -of fixed rate, precision, and accuracy. See the README and API files for -more details. diff --git a/README.md b/README.md index 6a7f60c5a..ca183cf19 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,21 @@ ZFP INTRODUCTION ------------ -This is zfp 0.5.1, an open source C/C++ library for compressed numerical -arrays that support high throughput read and write random access. zfp was -written by Peter Lindstrom at Lawrence Livermore National Laboratory, and -is loosely based on the algorithm described in the following paper: +zfp is an open source C/C++ library for compressed numerical arrays that +support high throughput read and write random access. zfp also supports +streaming compression of integer and floating-point data, e.g., for +applications that read and write large data sets to and from disk. + +zfp was developed at Lawrence Livermore National Laboratory and is loosely +based on the algorithm described in the following paper: Peter Lindstrom "Fixed-Rate Compressed Floating-Point Arrays" - IEEE Transactions on Visualization and Computer Graphics, - 20(12):2674-2683, December 2014 + IEEE Transactions on Visualization and Computer Graphics + 20(12):2674-2683, December 2014 doi:10.1109/TVCG.2014.2346458 -zfp was originally designed for floating-point data only, but has been +zfp was originally designed for floating-point arrays only, but has been extended to also support integer data, and could for instance be used to compress images and quantized volumetric data. 
To achieve high compression ratios, zfp uses lossy but optionally error-bounded compression. Although @@ -23,8 +26,8 @@ bit-for-bit lossless compression of floating-point data is not always possible, zfp is usually accurate to within machine epsilon in near-lossless mode. -zfp works best for 2D and 3D arrays that exhibit spatial coherence, such -as smooth fields from physics simulations, images, regularly sampled terrain +zfp works best for 2D and 3D arrays that exhibit spatial correlation, such as +continuous fields from physics simulations, images, regularly sampled terrain surfaces, etc. Although zfp also provides a 1D array class that can be used for 1D signals such as audio, or even unstructured floating-point streams, the compression scheme has not been well optimized for this use case, and @@ -32,9 +35,21 @@ rate and quality may not be competitive with floating-point compressors designed specifically for 1D streams. zfp is freely available as open source under a BSD license, as outlined in -the file 'LICENSE'. For information on the API and general usage, please -see the file 'API' in this directory. The file 'ISSUES' discusses common -issues and serves as a troubleshooting guide. +the file 'LICENSE'. For more information on zfp and comparisons with other +compressors, please see the zfp +[website](https://computation.llnl.gov/projects/floating-point-compression). +For questions, comments, requests, and bug reports, please contact +[Peter Lindstrom](mailto:pl@llnl.gov). + + +DOCUMENTATION +------------- + +Full +[documentation](http://zfp.readthedocs.io/en/release0.5.2/) +is available online via Read the Docs. A +[PDF](http://readthedocs.org/projects/zfp/downloads/pdf/release0.5.2/) +version is also available. INSTALLATION @@ -53,12 +68,12 @@ instructions on GNU and CMake builds. 
zfp has successfully been built and tested using these compilers: - gcc versions 4.4.7, 4.7.2, 4.8.2, 4.9.2, 5.3.1, 6.2.1 - icc versions 12.0.5, 12.1.5, 15.0.4, 16.0.1 + gcc versions 4.4.7, 4.7.2, 4.8.2, 4.9.2, 5.4.1, 6.3.0 + icc versions 12.0.5, 12.1.5, 15.0.4, 16.0.1, 17.0.0, 18.0.0 clang version 3.6.0 xlc version 12.1 - mingw32-gcc version 4.8.1 - Visual Studio version 14.0 + MinGW version 5.3.0 + Visual Studio versions 14.0 (2015), 14.1 (2017) NOTE: zfp requires 64-bit compiler and operating system support. @@ -110,388 +125,3 @@ To build zfp using Visual Studio on Windows, start an MSBuild shell and type This builds zfp in both debug and release mode. See the instructions for Linux on how to change the cmake line to also build the example programs. - - -ALGORITHM OVERVIEW ------------------- - -The zfp lossy compression scheme is based on the idea of breaking a -d-dimensional array into independent blocks of 4^d values, e.g. 4x4x4 -values in three dimensions. Each block is compressed/decompressed -entirely independently from all other blocks. In this sense, zfp is -similar to current hardware texture compression schemes for image -coding implemented on graphics cards and mobile devices. - -The compression scheme implemented in this version of zfp has evolved -from the method described in the paper cited above, and can conceptually -be thought of as consisting of eight sequential steps (in practice some -steps are consolidated or exist only for illustrative purposes): - -1. The d-dimensional array is partitioned into blocks of dimensions 4^d. -If the array dimensions are not multiples of four, then blocks near the -boundary are padded to the next multiple of four. This padding is -invisible to the application. - -2. The independent floating-point values in a block are converted to what -is known as a block-floating-point representation, which uses a single, -common floating-point exponent for all 4^d values. 
The effect of this -conversion is to turn each floating-point value into a 31- or 63-bit -signed integer. If the values in the block are all zero or are smaller -in magnitude than the fixed-accuracy tolerance (see below), then only a -single bit is stored with the block to indicate that it is "empty" and -expands to all zeros. Note that the block-floating-point conversion and -empty-block encoding are not performed if the input data is represented as -integers rather than floating-point numbers. - -3. The integers are decorrelated using a custom, high-speed, near -orthogonal transform similar to the discrete cosine transform used in -JPEG image coding. The transform exploits separability and is implemented -efficiently in-place using the lifting scheme, requiring only 2.5*d -integer additions and 1.5*d bit shifts by one per integer in d dimensions. -If the data is "smooth," then this transform will turn most integers into -small signed values clustered around zero. - -4. The signed integer coefficients are reordered in a manner similar to -JPEG zig-zag ordering so that statistically they appear in a roughly -monotonically decreasing order. Coefficients corresponding to low -frequencies tend to have larger magnitude, and are listed first. In 3D, -coefficients corresponding to frequencies i, j, k in the three dimensions -are ordered by i + j + k first, and then by i^2 + j^2 + k^2. - -5. The two's complement signed integers are converted to their negabinary -(base negative two) representation using one addition and one bit-wise -exclusive or per integer. Because negabinary has no dedicated single sign -bit, these integers are subsequently treated as unsigned. - -6. The bits that represent the list of 4^d integers are transposed so -that instead of being ordered by coefficient they are ordered by bit -plane, from most to least significant bit. 
Viewing each bit plane as an -unsigned integer, with the lowest bit corresponding to the lowest frequency -coefficient, the anticipation is that the first several of these transposed -integers are small, because the coefficients are assumed to be ordered by -magnitude. - -7. The transform coefficients are compressed losslessly using embedded -coding by exploiting the property that the coefficients tend to have many -leading zeros that need not be encoded explicitly. Each bit plane is -encoded in two parts, from lowest to highest bit. First the n lowest bits -are emitted verbatim, where n depends on previous bit planes and is -initially zero. Then a variable-length representation of the remaining -4^d - n bits, x, is encoded. For such an integer x, a single bit is -emitted to indicate if x = 0, in which case we are done with the current -bit plane. If not, then bits of x are emitted, starting from the lowest -bit, until a one bit is emitted. This triggers another test whether this -is the highest set bit of x, and the result of this test is output as a -single bit. If not, then the procedure repeats until all m of x's value -bits have been output, where 2^(m-1) <= x < 2^m. This can be thought of -as a run-length encoding of the zeros of x, where the run lengths are -expressed in unary. The total number of value bits, n, in this bit plane -is then incremented by m before being passed to the next bit plane, which -is encoded by first emitting its n lowest bits. The assumption is that -these bits correspond to n coefficients whose most significant bits have -already been output, i.e. these n bits are essentially random and not -compressible. Following this, the remaining 4^d - n bits of the bit plane -are run-length encoded as described above, which potentially results in n -being increased. - -8. The embedded coder emits one bit at a time, with each successive bit -potentially improving the quality of the reconstructed signal. 
The early -bits are most important and have the greatest impact on signal quality, -with the last few bits providing very small changes. The resulting -compressed bit stream can be truncated at any point and still allow for a -valid approximate reconstruction of the original signal. The final step -truncates the bit stream in one of three ways: to a fixed number of bits -(the fixed-rate mode); after some fixed number of bit planes have been -encoded (the fixed-precision mode); or until a lowest bit plane number -has been encoded, as expressed in relation to the common floating-point -exponent within the block (the fixed-accuracy mode). - -Various parameters are exposed for controlling the quality and compressed -size of a block, and can be specified by the user at a very fine -granularity. These parameters are discussed below. - - -ZFP COMMAND LINE TOOL ---------------------- - -The 'zfp' executable in the bin directory is primarily intended for -evaluating the rate-distortion (compression ratio and quality) provided by -the compressor, but since version 0.5.0 also allows reading and writing -compressed data sets. zfp takes as input a raw, binary array of floats or -doubles in native byte order, and optionally outputs a compressed or -reconstructed array obtained after lossy compression followed by -decompression. Various statistics on compression rate and error are also -displayed. - -zfp requires a set of command-line options, the most important being the --i option that specifies that the input is uncompressed. When present, -"-i <file>" tells zfp to read the uncompressed input file and compress it -to memory. If desired, the compressed stream can be written to an output -file using "-z <file>". When -i is absent, on the other hand, -z names -the compressed input (not output) file, which is then decompressed. In -either case, "-o <file>" can be used to output the reconstructed array -resulting from lossy compression and decompression. 
- -So, to compress a file, use "-i file.in -z file.zfp". To later decompress -the file, use "-z file.zfp -o file.out". A single dash "-" can be used in -place of a file name to denote standard input or output. - -When reading uncompressed input, the floating-point precision (single or -double) must be specified using either -f (float) or -d (double). In -addition, the array dimensions must be specified using "-1 nx" (for 1D -arrays), "-2 nx ny" (for 2D arrays), or "-3 nx ny nz" (for 3D arrays). -For multidimensional arrays, x varies faster than y, which in turn varies -faster than z. That is, a 3D input file should correspond to a flattened -C array declared as a[nz][ny][nx]. - -Note that "-2 nx ny" is not equivalent to "-3 nx ny 1", even though the -same number of values are compressed. One invokes the 2D codec, while the -other uses the 3D codec, which in this example has to pad the input to an -nx * ny * 4 array since arrays are partitioned into blocks of dimensions -4^d. Such padding usually negatively impacts compression. - -Using -h, the array dimensions and type are stored in a header of the -compressed stream so that they do not have to be specified on the command -line during decompression. The header also stores compression parameters, -which are described below. - -zfp accepts several options for specifying how the data is to be compressed. -The most general of these, the -c option, takes four constraint parameters -that together can be used to achieve various effects. These constraints -are: - - minbits: the minimum number of bits used to represent a block - maxbits: the maximum number of bits used to represent a block - maxprec: the maximum number of bit planes encoded - minexp: the smallest bit plane number encoded - -Options -r, -p, and -a provide a simpler interface to setting all of -the above parameters (see below). Bit plane e refers to those bits whose -place value is 2^e. 
For instance, in single precision, bit planes -149 -through 127 are supported (when also counting denormalized numbers); for -double precision, bit planes -1074 through 1023 are supported. - -Care must be taken to allow all constraints to be met, as encoding -terminates as soon as a single constraint is violated (except minbits, -which is satisfied at the end of encoding by padding zeros). The effects -of the above four parameters are best explained in terms of the three main -compression modes supported by zfp (see Algorithm Overview above for -additional details): - - Fixed rate (option -r): - In fixed-rate mode, each compressed block of 4^d values in d dimensions - is stored using a fixed number of bits specified by the user. This can - be achieved using option -c by setting minbits = maxbits, maxprec = 64, - and minexp = -1074. The fixed-rate mode is needed to support random - access to blocks, where the amortized number of bits used per value is - given by rate = maxbits / 4^d. Note that each block stores a leading - all-zeros bit and common exponent, and maxbits must be at least 9 for - single precision and 12 for double precision. - - Fixed precision (option -p): - In fixed-precision mode, the number of bits used to encode a block may - vary, but the number of bit planes (i.e. the precision) encoded for the - transform coefficients is fixed. This mode is achieved by specifying - the precision in maxprec and fully relaxing the size constraints, i.e. - minbits = 0, maxbits = 4171, and minexp = -1074. Fixed-precision - mode is preferable when relative rather than absolute errors matter. - - Fixed accuracy (option -a): - In fixed-accuracy mode, all transform coefficient bit planes up to a - minimum bit plane number are encoded. (The actual minimum bit plane - is not necessarily minexp, but depends on the dimensionality of the - data. 
The reason for this is that the inverse transform incurs range - expansion, and the amount of expansion depends on the number of - dimensions.) Thus, minexp should be interpreted as the base-2 logarithm - of an absolute error tolerance. In other words, given an uncompressed - value f and a reconstructed value g, the absolute difference |f - g| - is guaranteed to be at most 2^minexp. (Note that it is not possible to - guarantee error tolerances smaller than machine epsilon relative to the - largest value within a block.) This error tolerance is not always tight - (especially for 3D arrays), but can conservatively be set so that even - for worst-case inputs the error tolerance is respected. To achieve - fixed accuracy to within 'tolerance', use the -a option, - which sets minexp = floor(log2(tolerance)), minbits = 0, maxbits = 4171, - and maxprec = 64. As in fixed-precision mode, the number of bits used - per block is not fixed but is dictated by the data. Use -a 0 to achieve - near-lossless compression. Fixed-accuracy mode gives the highest quality - (in terms of absolute error) for a given compression rate, and is - preferable when random access is not needed. - -As mentioned above, other combinations of constraints can be used. -For example, to ensure that the compressed stream is not larger than -the uncompressed one, or that it fits within the amount of memory -allocated, one may in conjunction with other constraints set -maxbits = 4^d * CHAR_BIT * sizeof(Type), where Type is either float or -double. The minbits parameter is useful only in fixed-rate mode--when -minbits = maxbits, zero-bits are padded to blocks that compress to fewer -than maxbits bits. - - -CODE EXAMPLES -------------- - -The 'examples' directory includes five programs that make use of the -compressor. - -The 'simple' program is a minimal example that shows how to call the -compressor and decompressor on a double-precision 3D array. 
Without -the '-d' option, it will compress the array and write the compressed -stream to standard output. With the '-d' option, it will instead -read the compressed stream from standard input and decompress the -array: - - simple > compressed.zfp - simple -d < compressed.zfp - -For a more elaborate use of the compressor, see the 'zfp' utility. - -The 'diffusion' example is a simple forward Euler solver for the heat -equation on a 2D regular grid, and is intended to show how to declare -and work with zfp's compressed arrays, as well as give an idea of how -changing the compression rate affects the error in the solution. The -usage is: - - diffusion-zfp [rate] [nx] [ny] [nt] - -where 'rate' specifies the exact number of compressed bits to store per -double-precision floating-point value (default = 64); 'nx' and 'ny' -specify the grid size (default = 100x100); and 'nt' specifies the number -of time steps to run (the default is to run until time t = 1). - -Running diffusion with the following arguments - - diffusion-zfp 8 - diffusion-zfp 12 - diffusion-zfp 20 - diffusion-zfp 64 - -should result in this output - - rate=8 sum=0.996442 error=4.813938e-07 - rate=12 sum=0.998338 error=1.967777e-07 - rate=20 sum=0.998326 error=1.967952e-07 - rate=64 sum=0.998326 error=1.967957e-07 - -For speed and quality comparison, diffusion-raw solves the same problem -using uncompressed double-precision arrays. - -The 'speed' program takes two optional parameters: - - speed [rate] [blocks] - -It measures the throughput of compression and decompression of 3D -double-precision data (in megabytes of uncompressed data per second). -By default, a rate of 1 bit/value and two million blocks are -processed. - -The 'pgm' program illustrates how zfp can be used to compress grayscale -images in the pgm format. 
The usage is: - - pgm <param> <input.pgm >output.pgm - -If param is positive, it is interpreted as the rate in bits per pixel, -which ensures that each block of 4x4 pixels is compressed to a fixed -number of bits, as in texture compression codecs. If param is negative, -then fixed-precision mode is used with precision -param, which tends to -give higher quality for the same rate. This use of zfp is not intended -to compete with existing texture and image compression formats, but -exists merely to demonstrate how to compress 8-bit integer data with zfp. - -The 'inplace' example shows how one might use zfp to perform in-place -compression and decompression when memory is at a premium. Here the -floating-point array is overwritten with compressed data, which is later -decompressed back in place. This example also shows how to make use of -some of the low-level features of zfp, such as its low-level, block-based -compression API and bit stream functions that perform seeks on the bit -stream. The program takes one optional argument: - - inplace [tolerance] - -which specifies the fixed-accuracy absolute tolerance to use during -compression. Please see FAQ #19 for more on the limitations of in-place -compression. - - -REGRESSION TESTING ------------------- - -The 'testzfp' program in the 'tests' directory performs regression testing -that exercises most of the functionality of libzfp and the array classes. -The tests assume the default compiler settings, i.e. with none of the -macros in Config defined. By default, small, pregenerated floating-point -arrays are used in the test, since they tend to have the same binary -representation across platforms, whereas it can be difficult to -computationally generate bit-for-bit identical arrays. To test larger -arrays, modify the TESTZFP_* macros in Config. When large arrays are -used, the (de)compression throughput is also measured and reported in -number of uncompressed bytes per second. 
- - -LIMITATIONS AND MISSING FEATURES --------------------------------- - -zfp is released as a beta version with the intent of giving people access -to the code and soliciting feedback on how to improve zfp for the first -full release. As such, the zfp API is experimental and has not been -fixed, and it is entirely possible that future versions will employ a -different API or even a different codec. - -Below is a list of known limitations and desirable features that may make -it into future versions of zfp. - -- The current version of zfp allows for near lossless compression through - suitable parameter choices, but no guarantees are made that bit-for-bit - lossless compression is achieved. We envision supporting lossless - compression in a future version by compressing the difference between - the original data and nearly losslessly compressed data. - -- Special values like infinity and NaN are not supported. Denormalized - floating-point numbers are, however, correctly handled. There is an - implicit assumption that floating point conforms to IEEE, though - extensions to other floating-point formats should be possible with - minor effort. - -- No iterators are provided for traversing an array, and currently one - has to use integer indexing. Performance could in cases be improved - by limiting the traversal to sequential access. - -- It is not possible to access subarrays via pointers, e.g. via - double* p = &a[offset]; p[i] = ... A pointer proxy class similar to - the reference class will be added in the near future. - -- There currently is no way to make a complete copy of a compressed - array, i.e. a = b; does not work for arrays a and b. Copy constructors - and assignment operators will be added in the near future. - -- zfp can potentially provide higher precision than conventional float - and double arrays, but the interface currently does not expose this. 
- For example, such added precision could be useful in finite difference - computations, where catastrophic cancellation can be an issue when - insufficient precision is available. - -- Only single and double precision types are supported. Generalizations - to IEEE half and quad precision would be useful. For instance, - compressed 64-bit-per-value storage of 128-bit quad-precision numbers - could greatly improve the accuracy of double-precision floating-point - computations using the same amount of storage. - -- Complex-valued arrays are not directly supported. Real and imaginary - components must be stored as separate arrays, which may result in lost - opportunities for compression, e.g. if the complex magnitude is constant - and only the phase varies. - -- zfp arrays are not thread-safe. We are considering options for - supporting multi-threaded access, e.g. for OpenMP parallelization. - -- This version of zfp does not run on the GPU. Some work has been done to - port zfp to CUDA, and we expect to release such a version in the future. - - -QUESTIONS, COMMENTS, AND BUG REPORTS ------------------------------------- - -For bug reports, questions, and suggestions for improvements, please -contact Peter Lindstrom at pl@llnl.gov. If you end up using zfp in an -application, please consider sharing with the author your success story -and/or any issues encountered. diff --git a/VERSIONS b/VERSIONS index 5c9c75f38..f845b7d90 100644 --- a/VERSIONS +++ b/VERSIONS @@ -1,3 +1,25 @@ +zfp 0.5.2, September 28, 2017 + + - Added iterators and proxy objects for pointers and references. + + - Added example illustrating how to use iterators and pointers. + + - Modified diffusion example to optionally use iterators. + + - Moved internal headers under array to array/zfp. + + - Modified 64-bit integer typedefs to avoid the C89 non-compliant long long + and allow for user-supplied types and literal suffixes. + + - Renamed compile-time macros that did not have a ZFP prefix. 
+ + - Fixed issue with setting stream word type via CMake. + + - Rewrote documentation in reStructuredText and added complete + documentation of all public functions, classes, types, and macros. + Removed ASCII documentation. + + zfp 0.5.1, March 28, 2017 - This release primarily fixes a few minor issues but also includes diff --git a/appveyor.yml b/appveyor.yml index f0811f309..b07dbed41 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,12 +2,81 @@ version: 0.5.1-{build} environment: matrix: - - GENERATOR: Visual Studio 14 2015 Win64 + - COMPILER: mingw + GENERATOR: MinGW Makefiles + PLATFORM: Win32 + BUILD_TYPE: Debug + + - COMPILER: mingw + GENERATOR: MinGW Makefiles + PLATFORM: Win32 + BUILD_TYPE: Release + + - COMPILER: mingw-w64 + GENERATOR: MinGW Makefiles + PLATFORM: x64 + BUILD_TYPE: Debug + + - COMPILER: mingw-w64 + GENERATOR: MinGW Makefiles + PLATFORM: x64 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 Win64 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: x64 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 Win64 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: x64 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: Win32 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 15 2017 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + PLATFORM: Win32 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 Win64 + PLATFORM: x64 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 Win64 + PLATFORM: x64 + BUILD_TYPE: Release + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 + PLATFORM: Win32 + BUILD_TYPE: Debug + + - COMPILER: msvc + GENERATOR: Visual Studio 14 2015 + PLATFORM: Win32 + BUILD_TYPE: Release + +install: + - if "%COMPILER%"=="mingw" set PATH=C:\MinGW\bin;%PATH% + - if "%COMPILER%"=="mingw-w64" set 
PATH=C:\MinGW\bin;%PATH% build_script: - mkdir build - cd build - - cmake -G "%GENERATOR%" .. - - cmake --build . - - ctest -V -C "Debug" + - cmake -G "%GENERATOR%" -DCMAKE_BUILD_TYPE="%BUILD_TYPE%" -DCMAKE_SH=CMAKE_SH-NOTFOUND .. + + - if "%COMPILER%"=="msvc" cmake --build . --config "%BUILD_TYPE%" + - if not "%COMPILER%"=="msvc" cmake --build . + + - ctest -V -C "%BUILD_TYPE%" diff --git a/array/cache.h b/array/zfp/cache.h similarity index 91% rename from array/cache.h rename to array/zfp/cache.h index 3359b7d46..8e5a88027 100644 --- a/array/cache.h +++ b/array/zfp/cache.h @@ -1,9 +1,9 @@ -#ifndef CACHE_H -#define CACHE_H +#ifndef ZFP_CACHE_H +#define ZFP_CACHE_H #include "memory.h" -#ifdef CACHE_PROFILE +#ifdef ZFP_WITH_CACHE_PROFILE // maintain stats on hit and miss rates #include #endif @@ -76,7 +76,7 @@ class Cache { { if (pair.line) { uint i; - for (i = pair.line - c->line + 1; i <= c->mask && !c->tag[i].used(); i++); + for (i = uint(pair.line - c->line) + 1; i <= c->mask && !c->tag[i].used(); i++); pair = (i <= c->mask ? 
Pair(c->line + i, c->tag[i]) : Pair(0, Tag())); } } @@ -88,7 +88,7 @@ class Cache { Cache(uint minsize) : tag(0), line(0) { resize(minsize); -#ifdef CACHE_PROFILE +#ifdef ZFP_WITH_CACHE_PROFILE std::cerr << "cache lines=" << mask + 1 << std::endl; hit[0][0] = hit[1][0] = miss[0] = back[0] = 0; hit[0][1] = hit[1][1] = miss[1] = back[1] = 0; @@ -99,7 +99,7 @@ class Cache { { deallocate(tag); deallocate(line); -#ifdef CACHE_PROFILE +#ifdef ZFP_WITH_CACHE_PROFILE std::cerr << "cache R1=" << hit[0][0] << " R2=" << hit[1][0] << " RM=" << miss[0] << " RB=" << back[0] << " W1=" << hit[0][1] << " W2=" << hit[1][1] << " WM=" << miss[1] << " WB=" << back[1] << std::endl; #endif @@ -124,7 +124,7 @@ class Cache { uint i = primary(x); if (tag[i].index() == x) return line + i; -#ifdef CACHE_TWOWAY +#ifdef ZFP_WITH_CACHE_TWOWAY uint j = secondary(x); if (tag[j].index() == x) return line + j; @@ -142,19 +142,19 @@ class Cache { ptr = line + i; if (write) tag[i].mark(); -#ifdef CACHE_PROFILE +#ifdef ZFP_WITH_CACHE_PROFILE hit[0][write]++; #endif return tag[i]; } -#ifdef CACHE_TWOWAY +#ifdef ZFP_WITH_CACHE_TWOWAY uint j = secondary(x); if (tag[j].index() == x) { ptr = line + j; if (write) tag[j].mark(); -#ifdef CACHE_PROFILE - shit[write]++; +#ifdef ZFP_WITH_CACHE_PROFILE + hit[1][write]++; #endif return tag[j]; } @@ -164,7 +164,7 @@ class Cache { ptr = line + i; Tag t = tag[i]; tag[i] = Tag(x, write); -#ifdef CACHE_PROFILE +#ifdef ZFP_WITH_CACHE_PROFILE miss[write]++; if (tag[i].dirty()) back[write]++; @@ -182,7 +182,7 @@ class Cache { // flush cache line void flush(const Line* l) { - uint i = l - line; + uint i = uint(l - line); tag[i].clear(); } @@ -193,7 +193,7 @@ class Cache { uint primary(Index x) const { return x & mask; } uint secondary(Index x) const { -#ifdef CACHE_FAST_HASH +#ifdef ZFP_WITH_CACHE_FAST_HASH // max entropy hash for 26- to 16-bit mapping (not full avalanche) x -= x << 7; x ^= x >> 16; @@ -214,7 +214,7 @@ class Cache { Index mask; // cache line mask Tag* tag; 
// cache line tags Line* line; // actual decompressed cache lines -#ifdef CACHE_PROFILE +#ifdef ZFP_WITH_CACHE_PROFILE uint64 hit[2][2]; // number of primary/secondary read/write hits uint64 miss[2]; // number of read/write misses uint64 back[2]; // number of write-backs due to read/writes diff --git a/array/memory.h b/array/zfp/memory.h similarity index 76% rename from array/memory.h rename to array/zfp/memory.h index 1b4e4ec4d..01af4ef3f 100644 --- a/array/memory.h +++ b/array/zfp/memory.h @@ -1,5 +1,5 @@ -#ifndef MEMORY_H -#define MEMORY_H +#ifndef ZFP_MEMORY_H +#define ZFP_MEMORY_H #include #include "zfp/types.h" @@ -7,7 +7,7 @@ inline void* allocate(size_t size, size_t alignment = 0) { -#if defined(__USE_XOPEN2K) && defined(ALIGNED_ALLOC) +#if defined(__USE_XOPEN2K) && defined(ZFP_WITH_ALIGNED_ALLOC) void* ptr; if (alignment > 1) posix_memalign(&ptr, alignment, size); @@ -23,7 +23,7 @@ template inline void deallocate(T* ptr) { -#if defined(__USE_XOPEN2K) && defined(ALIGNED_ALLOC) +#if defined(__USE_XOPEN2K) && defined(ZFP_WITH_ALIGNED_ALLOC) if (ptr) free(ptr); #else diff --git a/array/zfparray.h b/array/zfparray.h index 878234d05..8aa426aef 100644 --- a/array/zfparray.h +++ b/array/zfparray.h @@ -4,7 +4,7 @@ #include #include #include "zfp.h" -#include "memory.h" +#include "zfp/memory.h" namespace zfp { diff --git a/array/zfparray1.h b/array/zfparray1.h index 1511e44de..71c88b1eb 100644 --- a/array/zfparray1.h +++ b/array/zfparray1.h @@ -1,9 +1,11 @@ #ifndef ZFP_ARRAY1_H #define ZFP_ARRAY1_H +#include +#include #include "zfparray.h" #include "zfpcodec.h" -#include "cache.h" +#include "zfp/cache.h" namespace zfp { @@ -99,6 +101,8 @@ class array1 : public array { cache.clear(); } + class pointer; + // reference to a single array value class reference { public: @@ -109,13 +113,91 @@ class array1 : public array { reference operator-=(Scalar val) { array->sub(i, val); return *this; } reference operator*=(Scalar val) { array->mul(i, val); return *this; } reference 
operator/=(Scalar val) { array->div(i, val); return *this; } + pointer operator&() const { return pointer(*this); } + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } protected: friend class array1; - reference(array1* array, uint i) : array(array), i(i) {} + friend class iterator; + explicit reference(array1* array, uint i) : array(array), i(i) {} array1* array; uint i; }; + // pointer to a single array value + class pointer { + public: + pointer() : ref(0, 0) {} + pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; return *this; } + reference operator*() const { return ref; } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { ref.i += d; return *this; } + pointer operator-=(ptrdiff_t d) { ref.i -= d; return *this; } + pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } + pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } + ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } + bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i; } + bool operator!=(const pointer& p) const { return !operator==(p); } + protected: + friend class array1; + friend class reference; + explicit pointer(reference r) : ref(r) {} + explicit pointer(array1* array, uint i) : ref(array, i) {} + ptrdiff_t index() const { return ref.i; } + void set(ptrdiff_t index) { ref.i = index; } + void increment() { ref.i++; } + void decrement() { ref.i--; } + reference ref; + }; + + // random access iterator 
that visits array block by block + class iterator { + public: + // typedefs for STL compatibility + typedef Scalar value_type; + typedef ptrdiff_t difference_type; + typedef typename array1::reference reference; + typedef typename array1::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + iterator() : ref(0, 0) {} + iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; return *this; } + reference operator*() const { return ref; } + reference operator[](difference_type d) const { return *operator+(d); } + iterator& operator++() { increment(); return *this; } + iterator& operator--() { decrement(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + iterator operator--(int) { iterator it = *this; decrement(); return it; } + iterator operator+=(difference_type d) { ref.i += d; return *this; } + iterator operator-=(difference_type d) { ref.i -= d; return *this; } + iterator operator+(difference_type d) const { return iterator(ref.array, ref.i + d); } + iterator operator-(difference_type d) const { return iterator(ref.array, ref.i - d); } + difference_type operator-(const iterator& it) const { return static_cast(ref.i) - static_cast(it.ref.i); } + bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i; } + bool operator!=(const iterator& it) const { return !operator==(it); } + bool operator<=(const iterator& it) const { return ref.array == it.ref.array && ref.i <= it.ref.i; } + bool operator>=(const iterator& it) const { return ref.array == it.ref.array && ref.i >= it.ref.i; } + bool operator<(const iterator& it) const { return !operator>=(it); } + bool operator>(const iterator& it) const { return !operator<=(it); } + uint i() const { return ref.i; } + protected: + friend class array1; + explicit iterator(array1* array, uint i) : ref(array, i) {} + void increment() { ref.i++; } + void decrement() { ref.i--; } + reference ref; 
+ }; + // (i) accessors const Scalar& operator()(uint i) const { return get(i); } reference operator()(uint i) { return reference(this, i); } @@ -124,6 +206,10 @@ class array1 : public array { const Scalar& operator[](uint index) const { return get(index); } reference operator[](uint index) { return reference(this, index); } + // random access iterators + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, nx); } + protected: // cache line representing one block of decompressed values class CacheLine { diff --git a/array/zfparray2.h b/array/zfparray2.h index 58fd4bc7a..04a608f6f 100644 --- a/array/zfparray2.h +++ b/array/zfparray2.h @@ -1,9 +1,11 @@ #ifndef ZFP_ARRAY2_H #define ZFP_ARRAY2_H +#include +#include #include "zfparray.h" #include "zfpcodec.h" -#include "cache.h" +#include "zfp/cache.h" namespace zfp { @@ -108,6 +110,8 @@ class array2 : public array { cache.clear(); } + class pointer; + // reference to a single array value class reference { public: @@ -118,13 +122,107 @@ class array2 : public array { reference operator-=(Scalar val) { array->sub(i, j, val); return *this; } reference operator*=(Scalar val) { array->mul(i, j, val); return *this; } reference operator/=(Scalar val) { array->div(i, j, val); return *this; } + pointer operator&() const { return pointer(*this); } + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } protected: friend class array2; - reference(array2* array, uint i, uint j) : array(array), i(i), j(j) {} + friend class iterator; + explicit reference(array2* array, uint i, uint j) : array(array), i(i), j(j) {} array2* array; uint i, j; }; + // pointer to a single value in flattened array + class pointer { + public: + pointer() : ref(0, 0, 0) {} + pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; 
return *this; } + reference operator*() const { return ref; } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; } + pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; } + pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } + pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } + ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } + bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j; } + bool operator!=(const pointer& p) const { return !operator==(p); } + protected: + friend class array2; + friend class reference; + explicit pointer(reference r) : ref(r) {} + explicit pointer(array2* array, uint i, uint j) : ref(array, i, j) {} + ptrdiff_t index() const { return ref.i + ref.array->nx * ref.j; } + void set(ptrdiff_t index) { ref.array->ij(ref.i, ref.j, index); } + void increment() + { + if (++ref.i == ref.array->nx) { + ref.i = 0; + ref.j++; + } + } + void decrement() + { + if (!ref.i--) { + ref.i = ref.array->nx - 1; + ref.j--; + } + } + reference ref; + }; + + // forward iterator that visits array block by block + class iterator { + public: + // typedefs for STL compatibility + typedef Scalar value_type; + typedef ptrdiff_t difference_type; + typedef typename array2::reference reference; + typedef typename array2::pointer pointer; + typedef std::forward_iterator_tag iterator_category; + + iterator() : ref(0, 0, 0) {} + iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; return *this; } + reference operator*() const { return ref; } 
+ iterator& operator++() { increment(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j; } + bool operator!=(const iterator& it) const { return !operator==(it); } + uint i() const { return ref.i; } + uint j() const { return ref.j; } + protected: + friend class array2; + explicit iterator(array2* array, uint i, uint j) : ref(array, i, j) {} + void increment() + { + ref.i++; + if (!(ref.i & 3u) || ref.i == ref.array->nx) { + ref.i = (ref.i - 1) & ~3u; + ref.j++; + if (!(ref.j & 3u) || ref.j == ref.array->ny) { + ref.j = (ref.j - 1) & ~3u; + // done with block; advance to next + if ((ref.i += 4) >= ref.array->nx) { + ref.i = 0; + if ((ref.j += 4) >= ref.array->ny) + ref.j = ref.array->ny; + } + } + } + } + reference ref; + }; + // (i, j) accessors const Scalar& operator()(uint i, uint j) const { return get(i, j); } reference operator()(uint i, uint j) { return reference(this, i, j); } @@ -143,6 +241,10 @@ class array2 : public array { return reference(this, i, j); } + // sequential iterators + iterator begin() { return iterator(this, 0, 0); } + iterator end() { return iterator(this, 0, ny); } + protected: // cache line representing one block of decompressed values class CacheLine { diff --git a/array/zfparray3.h b/array/zfparray3.h index 3ed617558..68f1022aa 100644 --- a/array/zfparray3.h +++ b/array/zfparray3.h @@ -1,9 +1,11 @@ #ifndef ZFP_ARRAY3_H #define ZFP_ARRAY3_H +#include +#include #include "zfparray.h" #include "zfpcodec.h" -#include "cache.h" +#include "zfp/cache.h" namespace zfp { @@ -114,6 +116,8 @@ class array3 : public array { cache.clear(); } + class pointer; + // reference to a single array value class reference { public: @@ -124,13 +128,121 @@ class array3 : public array { reference operator-=(Scalar val) { array->sub(i, j, k, val); return *this; } reference operator*=(Scalar val) { 
array->mul(i, j, k, val); return *this; } reference operator/=(Scalar val) { array->div(i, j, k, val); return *this; } + pointer operator&() const { return pointer(*this); } + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + Scalar x = a.operator Scalar(); + Scalar y = b.operator Scalar(); + b.operator=(x); + a.operator=(y); + } protected: friend class array3; - reference(array3* array, uint i, uint j, uint k) : array(array), i(i), j(j), k(k) {} + friend class iterator; + explicit reference(array3* array, uint i, uint j, uint k) : array(array), i(i), j(j), k(k) {} array3* array; uint i, j, k; }; + // pointer to a single value in flattened array + class pointer { + public: + pointer() : ref(0, 0, 0, 0) {} + pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; ref.k = p.ref.k; return *this; } + reference operator*() const { return ref; } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; } + pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; } + pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } + pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } + ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } + bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j && ref.k == p.ref.k; } + bool operator!=(const pointer& p) const { return !operator==(p); } + protected: + friend class array3; + friend class reference; + explicit pointer(reference r) : ref(r) {} + explicit pointer(array3* array, uint i, 
uint j, uint k) : ref(array, i, j, k) {} + ptrdiff_t index() const { return ref.i + ref.array->nx * (ref.j + ref.array->ny * ref.k); } + void set(ptrdiff_t index) { ref.array->ijk(ref.i, ref.j, ref.k, index); } + void increment() + { + if (++ref.i == ref.array->nx) { + ref.i = 0; + if (++ref.j == ref.array->ny) { + ref.j = 0; + ref.k++; + } + } + } + void decrement() + { + if (!ref.i--) { + ref.i = ref.array->nx - 1; + if (!ref.j--) { + ref.j = ref.array->ny - 1; + ref.k--; + } + } + } + reference ref; + }; + + // forward iterator that visits array block by block + class iterator { + public: + // typedefs for STL compatibility + typedef Scalar value_type; + typedef ptrdiff_t difference_type; + typedef typename array3::reference reference; + typedef typename array3::pointer pointer; + typedef std::forward_iterator_tag iterator_category; + + iterator() : ref(0, 0, 0, 0) {} + iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; ref.k = it.ref.k; return *this; } + reference operator*() const { return ref; } + iterator& operator++() { increment(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j && ref.k == it.ref.k; } + bool operator!=(const iterator& it) const { return !operator==(it); } + uint i() const { return ref.i; } + uint j() const { return ref.j; } + uint k() const { return ref.k; } + protected: + friend class array3; + explicit iterator(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {} + void increment() + { + ref.i++; + if (!(ref.i & 3u) || ref.i == ref.array->nx) { + ref.i = (ref.i - 1) & ~3u; + ref.j++; + if (!(ref.j & 3u) || ref.j == ref.array->ny) { + ref.j = (ref.j - 1) & ~3u; + ref.k++; + if (!(ref.k & 3u) || ref.k == ref.array->nz) { + ref.k = (ref.k - 1) & ~3u; + // done with block; advance to next + if ((ref.i += 4) >= 
ref.array->nx) { + ref.i = 0; + if ((ref.j += 4) >= ref.array->ny) { + ref.j = 0; + if ((ref.k += 4) >= ref.array->nz) + ref.k = ref.array->nz; + } + } + } + } + } + } + reference ref; + }; + // (i, j, k) accessors const Scalar& operator()(uint i, uint j, uint k) const { return get(i, j, k); } reference operator()(uint i, uint j, uint k) { return reference(this, i, j, k); } @@ -149,6 +261,10 @@ class array3 : public array { return reference(this, i, j, k); } + // sequential iterators + iterator begin() { return iterator(this, 0, 0, 0); } + iterator end() { return iterator(this, 0, 0, nz); } + protected: // cache line representing one block of decompressed values class CacheLine { diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ae804704d..f4c300c14 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,15 +1,23 @@ -add_executable(diffusion-raw diffusion.cpp) -target_compile_definitions(diffusion-raw PUBLIC WITHOUT_COMPRESSION) -target_link_libraries(diffusion-raw zfp) +add_executable(diffusion diffusion.cpp) +target_link_libraries(diffusion zfp) +target_compile_definitions(diffusion PRIVATE ${zfp_defs}) -add_executable(diffusion-zfp diffusion.cpp) -target_link_libraries(diffusion-zfp zfp) +add_executable(inplace inplace.c) +target_link_libraries(inplace zfp) +target_compile_definitions(inplace PRIVATE ${zfp_defs}) + +add_executable(iterator iterator.cpp) +target_link_libraries(iterator zfp) +target_compile_definitions(iterator PRIVATE ${zfp_defs}) add_executable(pgm pgm.c) target_link_libraries(pgm zfp) +target_compile_definitions(pgm PRIVATE ${zfp_defs}) add_executable(simple simple.c) target_link_libraries(simple zfp) +target_compile_definitions(simple PRIVATE ${zfp_defs}) add_executable(speed speed.c) target_link_libraries(speed zfp) +target_compile_definitions(speed PRIVATE ${zfp_defs}) diff --git a/examples/Makefile b/examples/Makefile index d5b25b49e..cb218dd3f 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ 
-1,9 +1,9 @@ include ../Config BINDIR = ../bin -TARGETS = $(BINDIR)/diffusion-raw\ - $(BINDIR)/diffusion-zfp\ +TARGETS = $(BINDIR)/diffusion\ $(BINDIR)/inplace\ + $(BINDIR)/iterator\ $(BINDIR)/pgm\ $(BINDIR)/simple\ $(BINDIR)/speed @@ -12,15 +12,15 @@ CXXLIBS = -L../lib -lzfp all: $(TARGETS) -$(BINDIR)/diffusion-raw: diffusion.cpp ../lib/libzfp.a - $(CXX) $(CXXFLAGS) -DWITHOUT_COMPRESSION -I../array diffusion.cpp $(CXXLIBS) -o $@ - -$(BINDIR)/diffusion-zfp: diffusion.cpp ../lib/libzfp.a +$(BINDIR)/diffusion: diffusion.cpp ../lib/libzfp.a $(CXX) $(CXXFLAGS) -I../array diffusion.cpp $(CXXLIBS) -o $@ $(BINDIR)/inplace: inplace.c ../lib/libzfp.a $(CC) $(CFLAGS) inplace.c $(CLIBS) -o $@ +$(BINDIR)/iterator: iterator.cpp ../lib/libzfp.a + $(CXX) $(CXXFLAGS) -I../array iterator.cpp $(CXXLIBS) -o $@ + $(BINDIR)/pgm: pgm.c ../lib/libzfp.a $(CC) $(CFLAGS) pgm.c $(CLIBS) -o $@ diff --git a/examples/array2d.h b/examples/array2d.h index 7b9f92f54..861fa25a9 100644 --- a/examples/array2d.h +++ b/examples/array2d.h @@ -7,19 +7,43 @@ typedef unsigned int uint; // uncompressed 2D double-precision array (for comparison) +namespace raw { class array2d { public: - array2d(uint nx, uint ny, uint precision) : nx(nx), ny(ny), data(nx * ny, 0.0) {} + array2d() : nx(0), ny(0) {} + array2d(uint nx, uint ny, double rate = 0.0, const double* p = 0, size_t csize = 0) : nx(nx), ny(ny), data(nx * ny, 0.0) {} + void resize(uint nx, uint ny) { this->nx = nx; this->ny = ny; data.resize(nx * ny, 0.0); } size_t size() const { return data.size(); } + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } double rate() const { return CHAR_BIT * sizeof(double); } + size_t cache_size() const { return 0; } double& operator()(uint x, uint y) { return data[x + nx * y]; } const double& operator()(uint x, uint y) const { return data[x + nx * y]; } double& operator[](uint i) { return data[i]; } const double& operator[](uint i) const { return data[i]; } + class iterator { + public: + 
double& operator*() const { return array->operator[](index); } + iterator& operator++() { index++; return *this; } + iterator operator++(int) { iterator p = *this; index++; return p; } + bool operator==(const iterator& it) const { return array == it.array && index == it.index; } + bool operator!=(const iterator& it) const { return !operator==(it); } + uint i() const { return index % array->nx; } + uint j() const { return index / array->nx; } + protected: + friend class array2d; + iterator(array2d* array, uint index) : array(array), index(index) {} + array2d* array; + uint index; + }; + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, nx * ny); } protected: uint nx; uint ny; std::vector data; }; +} #endif diff --git a/examples/diffusion.cpp b/examples/diffusion.cpp index c0c1b4fc8..d29791bf3 100644 --- a/examples/diffusion.cpp +++ b/examples/diffusion.cpp @@ -6,102 +6,203 @@ #include #include #include +#include "zfparray2.h" +#include "array2d.h" -#ifdef WITHOUT_COMPRESSION - #include "array2d.h" -#else - #include "zfparray2.h" - using namespace zfp; -#endif +// constants used in the solution +class Constants { +public: + Constants(int nx, int ny, int nt) : + nx(nx), + ny(ny), + nt(nt), + x0((nx - 1) / 2), + y0((ny - 1) / 2), + k(0.04), + dx(2.0 / (std::max(nx, ny) - 1)), + dy(2.0 / (std::max(nx, ny) - 1)), + dt(0.5 * (dx * dx + dy * dy) / (8 * k)), + tfinal(nt ? nt * dt : 1.0), + pi(3.14159265358979323846) + {} -int main(int argc, char* argv[]) + int nx; // grid points in x + int ny; // grid points in y + int nt; // number of time steps (0 for default) + int x0; // x location of heat source + int y0; // y location of heat source + double k; // diffusion constant + double dx; // grid spacing in x + double dy; // grid spacing in y + double dt; // time step + double tfinal; // minimum time to run solution to + double pi; // 3.141... 
+}; + +// advance solution using integer array indices +template +inline void +time_step_indexed(array2d& u, const Constants& c) { - int nx = 0; - int ny = 0; - int nt = 0; - double rate = 64; + // compute du/dt + array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); + for (int y = 1; y < c.ny - 1; y++) { + for (int x = 1; x < c.nx - 1; x++) { + double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); + double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); + du(x, y) = c.dt * c.k * (uxx + uyy); + } + } + // take forward Euler step + for (uint i = 0; i < u.size(); i++) + u[i] += du[i]; +} - // parse arguments - switch (argc) { - case 5: - if (sscanf(argv[4], "%d", &nt) != 1) - goto usage; - // FALLTHROUGH - case 4: - if (sscanf(argv[2], "%d", &nx) != 1 || - sscanf(argv[3], "%d", &ny) != 1) - goto usage; - // FALLTHROUGH - case 2: - if (sscanf(argv[1], "%lf", &rate) != 1) - goto usage; - // FALLTHROUGH - case 1: - break; - default: - usage: - std::cerr << "Usage: diffusion [rate] [nx] [ny] [nt]" << std::endl; - return EXIT_FAILURE; +// advance solution using array iterators +template +inline void +time_step_iterated(array2d& u, const Constants& c) +{ + // compute du/dt + array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); + for (typename array2d::iterator p = du.begin(); p != du.end(); p++) { + int x = p.i(); + int y = p.j(); + if (1 <= x && x <= c.nx - 2 && + 1 <= y && y <= c.ny - 2) { + double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); + double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); + *p = c.dt * c.k * (uxx + uyy); + } } + // take forward Euler step + for (typename array2d::iterator p = u.begin(), q = du.begin(); p != u.end(); p++, q++) + *p += *q; +} - // grid dimensions - if (nx == 0) - nx = 100; - if (ny == 0) - ny = nx; - - // location of point heat source - int x0 = (nx - 1) / 2; - int y0 = (ny - 1) / 2; - - // constants used in the solution - const double k = 0.04; - const double 
dx = 2.0 / (std::max(nx, ny) - 1); - const double dy = 2.0 / (std::max(nx, ny) - 1); - const double dt = 0.5 * (dx * dx + dy * dy) / (8 * k); - const double tfinal = nt ? nt * dt : 1; - const double pi = 3.14159265358979323846; - - // initialize u (constructor zero-initializes) - array2d u(nx, ny, rate); - rate = u.rate(); - u(x0, y0) = 1; +// solve heat equation using +template +inline double +solve(array2d& u, const Constants& c, bool iterator) +{ + // initialize u with point heat source (u is assumed to be zero initialized) + u(c.x0, c.y0) = 1; // iterate until final time - std::cerr.precision(6); double t; - for (t = 0; t < tfinal; t += dt) { - std::cerr << "t=" << std::fixed << t << std::endl; - // compute du/dt - array2d du(nx, ny, rate); - for (int y = 1; y < ny - 1; y++) { - for (int x = 1; x < nx - 1; x++) { - double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (dx * dx); - double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (dy * dy); - du(x, y) = dt * k * (uxx + uyy); - } - } - // take forward Euler step - for (uint i = 0; i < u.size(); i++) - u[i] += du[i]; + for (t = 0; t < c.tfinal; t += c.dt) { + std::cerr << "t=" << std::setprecision(6) << std::fixed << t << std::endl; + if (iterator) + time_step_iterated(u, c); + else + time_step_indexed(u, c); } - // compute root mean square error with respect to exact solution + return t; +} + +// compute sum of array values +template +inline double +total(const array2d& u) +{ + double s = 0; + const int nx = u.size_x(); + const int ny = u.size_y(); + for (int y = 1; y < ny - 1; y++) + for (int x = 1; x < nx - 1; x++) + s += u(x, y); + return s; +} + +// compute root mean square error with respect to exact solution +template +inline double +error(const array2d& u, const Constants& c, double t) +{ double e = 0; - double sum = 0; - for (int y = 1; y < ny - 1; y++) { - double py = dy * (y - y0); - for (int x = 1; x < nx - 1; x++) { - double px = dx * (x - x0); + for (int y = 1; y < c.ny - 1; y++) { + 
double py = c.dy * (y - c.y0); + for (int x = 1; x < c.nx - 1; x++) { + double px = c.dx * (x - c.x0); double f = u(x, y); - double g = dx * dy * std::exp(-(px * px + py * py) / (4 * k * t)) / (4 * pi * k * t); + double g = c.dx * c.dy * std::exp(-(px * px + py * py) / (4 * c.k * t)) / (4 * c.pi * c.k * t); e += (f - g) * (f - g); - sum += f; } } - e = std::sqrt(e / ((nx - 2) * (ny - 2))); + return std::sqrt(e / ((c.nx - 2) * (c.ny - 2))); +} + +inline int +usage() +{ + std::cerr << "Usage: diffusion [options]" << std::endl; + std::cerr << "Options:" << std::endl; + std::cerr << "-i : traverse arrays using iterators" << std::endl; + std::cerr << "-n : number of grid points" << std::endl; + std::cerr << "-t : number of time steps" << std::endl; + std::cerr << "-r : use compressed arrays with 'rate' bits/value" << std::endl; + std::cerr << "-c : use 'blocks' 4x4 blocks of cache" << std::endl; + return EXIT_FAILURE; +} + +int main(int argc, char* argv[]) +{ + int nx = 100; + int ny = 100; + int nt = 0; + double rate = 64; + bool iterator = false; + bool compression = false; + int cache = 0; + + // parse command-line options + for (int i = 1; i < argc; i++) + if (std::string(argv[i]) == "-i") + iterator = true; + else if (std::string(argv[i]) == "-n") { + if (++i == argc || sscanf(argv[i], "%i", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%i", &ny) != 1) + return usage(); + } + else if (std::string(argv[i]) == "-t") { + if (++i == argc || sscanf(argv[i], "%i", &nt) != 1) + return usage(); + } + else if (std::string(argv[i]) == "-r") { + if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) + return usage(); + compression = true; + } + else if (std::string(argv[i]) == "-c") { + if (++i == argc || sscanf(argv[i], "%i", &cache) != 1) + return usage(); + } + else + return usage(); + + Constants c(nx, ny, nt); + + double sum; + double err; + if (compression) { + // solve problem using compressed arrays + zfp::array2d u(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double)); + 
rate = u.rate(); + double t = solve(u, c, iterator); + sum = total(u); + err = error(u, c, t); + } + else { + // solve problem using uncompressed arrays + raw::array2d u(nx, ny); + double t = solve(u, c, iterator); + sum = total(u); + err = error(u, c, t); + } + std::cerr.unsetf(std::ios::fixed); - std::cerr << "rate=" << rate << " sum=" << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << e << std::endl; + std::cerr << "rate=" << rate << " sum=" << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl; return 0; } diff --git a/examples/iterator.cpp b/examples/iterator.cpp new file mode 100644 index 000000000..c7623cd6d --- /dev/null +++ b/examples/iterator.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include "zfparray1.h" +#include "zfparray2.h" +#include "zfparray3.h" + +void print1(zfp::array1::pointer p, size_t n) +{ + for (size_t i = 0; i < n; i++) + std::cout << p[i] << std::endl; +} + +void print2(zfp::array2::pointer p, size_t n) +{ + while (n--) + std::cout << *p++ << std::endl; +} + +void print3(zfp::array1::iterator begin, zfp::array1::iterator end) +{ + for (zfp::array1::iterator p = begin; p != end; p++) + std::cout << *p << std::endl; +} + +int main() +{ + // some fun with 1D arrays + zfp::array1 v(10, 64.0); + // initialize and print array of random values + for (zfp::array1::iterator p = v.begin(); p != v.end(); p++) + *p = drand48(); + std::cout << "random array" << std::endl; + print1(&v[0], v.size()); + std::cout << std::endl; + // sorting is possible via random access iterators (1D arrays only) + std::sort(v.begin(), v.end()); + // print array using iteration + std::cout << "sorted array" << std::endl; + print3(v.begin(), v.end()); + std::cout << std::endl; + + // some fun with 2D arrays + zfp::array2 a(5, 7, 64.0); + // print array indices visited in block-order traversal + std::cout << "block order (x, y) indices" << std::endl; + for (zfp::array2::iterator p = 
a.begin(); p != a.end(); p++) { + std::cout << "(" << p.i() << ", " << p.j() << ")" << std::endl; + *p = p.i() + 10 * p.j(); + } + std::cout << std::endl; + // print array contents in row-major order + std::cout << "row-major order yx indices" << std::endl; + print2(&a[0], a.size()); + std::cout << std::endl; + // pointer arithmetic + std::cout << a.size_x() << " * " << a.size_y() << " = " << (&*a.end() - &*a.begin()) << std::endl; + // min and max values + std::cout << "min = " << *std::min_element(a.begin(), a.end()) << std::endl; + std::cout << "max = " << *std::max_element(a.begin(), a.end()) << std::endl; + std::cout << std::endl; + + // some fun with 3D arrays + zfp::array3 b(7, 2, 5, 64.0); + // print array indices visited in block-order traversal + std::cout << "block order (x, y, z) indices" << std::endl; + for (zfp::array3::iterator p = b.begin(); p != b.end(); p++) + std::cout << "(" << p.i() << ", " << p.j() << ", " << p.k() << ")" << std::endl; + std::cout << std::endl; + // pointer arithmetic + std::cout << b.size_x() << " * " << b.size_y() << " * " << b.size_z() << " = " << (&*b.end() - &*b.begin()) << std::endl; + + return 0; +} diff --git a/examples/speed.c b/examples/speed.c index e3688a7bf..9332605d5 100644 --- a/examples/speed.c +++ b/examples/speed.c @@ -8,70 +8,70 @@ /* example 3D block of (reinterpreted) doubles */ static const uint64 block[] = { -0xbf7c3a7bb8495ca9ull, -0xbf79f9d9058ffdafull, -0xbf77c7abd0b61999ull, -0xbf75a42c806bd1daull, -0xbf738f8f740b8ea8ull, -0xbf718a050399fef8ull, -0xbf6f2772ff8c30feull, -0xbf6b59aa63d22f68ull, -0xbf67aaf8b80cff9eull, -0xbf641b9e71983592ull, -0xbf60abd3f723f2b7ull, -0xbf5ab7934169cc04ull, -0xbf54574f6f4897d3ull, -0xbf4c6e39da7fb99bull, -0xbf40ae5826a893d1ull, -0xbf25bce8e19d48e1ull, -0x3f253bfed65904d7ull, -0x3f3f18ab46a04cf3ull, -0x3f4948e7cb74278bull, -0x3f51427b51aeec2eull, -0x3f55a0716d8b4b6bull, -0x3f59be96aeaac56full, -0x3f5d9d3ba7bfd327ull, -0x3f609e608469e93eull, -0x3f624ecbcfa3832cull, 
-0x3f63e0202ae84b4dull, -0x3f6552a61a3f4812ull, -0x3f66a6ae305af268ull, -0x3f67dc910e9935bcull, -0x3f68f4af65036ff7ull, -0x3f69ef71f24e7182ull, -0x3f6acd4983da7d43ull, -0x3f6b8eaef5b348a0ull, -0x3f6c3423328ffb7aull, -0x3f6cbe2f33d33034ull, -0x3f6d2d64018af3acull, -0x3f6d825ab270c540ull, -0x3f6dbdb46be996ccull, -0x3f6de01a6205cca9ull, -0x3f6dea3dd7813dafull, -0x3f6ddcd81dc33335ull, -0x3f6db8aa94de690full, -0x3f6d7e7eab910d8full, -0x3f6d2f25df44c187ull, -0x3f6ccb79bc0e9844ull, -0x3f6c545bdcaf1795ull, -0x3f6bcab5ea9237c4ull, -0x3f6b2f799dcf639bull, -0x3f6a83a0bd297862ull, -0x3f69c82d1e0ec5deull, -0x3f68fe28a4990e53ull, -0x3f6826a5438d8685ull, -0x3f6742bcfc5cd5b2ull, -0x3f665391df231599ull, -0x3f655a4e0aa7d278ull, -0x3f645823ac5e0b09ull, -0x3f634e4d00643085ull, -0x3f623e0c518426a3ull, -0x3f6128abf933439aull, -0x3f600f7e5f92501cull, -0x3f5de7bbf6db0eb7ull, -0x3f5bae5aa4792e11ull, -0x3f5975adf0453ea2ull, -0x3f57409b1fdc65c4ull, +UINT64C(0xbf7c3a7bb8495ca9), +UINT64C(0xbf79f9d9058ffdaf), +UINT64C(0xbf77c7abd0b61999), +UINT64C(0xbf75a42c806bd1da), +UINT64C(0xbf738f8f740b8ea8), +UINT64C(0xbf718a050399fef8), +UINT64C(0xbf6f2772ff8c30fe), +UINT64C(0xbf6b59aa63d22f68), +UINT64C(0xbf67aaf8b80cff9e), +UINT64C(0xbf641b9e71983592), +UINT64C(0xbf60abd3f723f2b7), +UINT64C(0xbf5ab7934169cc04), +UINT64C(0xbf54574f6f4897d3), +UINT64C(0xbf4c6e39da7fb99b), +UINT64C(0xbf40ae5826a893d1), +UINT64C(0xbf25bce8e19d48e1), +UINT64C(0x3f253bfed65904d7), +UINT64C(0x3f3f18ab46a04cf3), +UINT64C(0x3f4948e7cb74278b), +UINT64C(0x3f51427b51aeec2e), +UINT64C(0x3f55a0716d8b4b6b), +UINT64C(0x3f59be96aeaac56f), +UINT64C(0x3f5d9d3ba7bfd327), +UINT64C(0x3f609e608469e93e), +UINT64C(0x3f624ecbcfa3832c), +UINT64C(0x3f63e0202ae84b4d), +UINT64C(0x3f6552a61a3f4812), +UINT64C(0x3f66a6ae305af268), +UINT64C(0x3f67dc910e9935bc), +UINT64C(0x3f68f4af65036ff7), +UINT64C(0x3f69ef71f24e7182), +UINT64C(0x3f6acd4983da7d43), +UINT64C(0x3f6b8eaef5b348a0), +UINT64C(0x3f6c3423328ffb7a), +UINT64C(0x3f6cbe2f33d33034), 
+UINT64C(0x3f6d2d64018af3ac), +UINT64C(0x3f6d825ab270c540), +UINT64C(0x3f6dbdb46be996cc), +UINT64C(0x3f6de01a6205cca9), +UINT64C(0x3f6dea3dd7813daf), +UINT64C(0x3f6ddcd81dc33335), +UINT64C(0x3f6db8aa94de690f), +UINT64C(0x3f6d7e7eab910d8f), +UINT64C(0x3f6d2f25df44c187), +UINT64C(0x3f6ccb79bc0e9844), +UINT64C(0x3f6c545bdcaf1795), +UINT64C(0x3f6bcab5ea9237c4), +UINT64C(0x3f6b2f799dcf639b), +UINT64C(0x3f6a83a0bd297862), +UINT64C(0x3f69c82d1e0ec5de), +UINT64C(0x3f68fe28a4990e53), +UINT64C(0x3f6826a5438d8685), +UINT64C(0x3f6742bcfc5cd5b2), +UINT64C(0x3f665391df231599), +UINT64C(0x3f655a4e0aa7d278), +UINT64C(0x3f645823ac5e0b09), +UINT64C(0x3f634e4d00643085), +UINT64C(0x3f623e0c518426a3), +UINT64C(0x3f6128abf933439a), +UINT64C(0x3f600f7e5f92501c), +UINT64C(0x3f5de7bbf6db0eb7), +UINT64C(0x3f5bae5aa4792e11), +UINT64C(0x3f5975adf0453ea2), +UINT64C(0x3f57409b1fdc65c4), }; int main(int argc, char* argv[]) diff --git a/include/zfp.h b/include/zfp.h index 2706df89b..f72c5a297 100644 --- a/include/zfp.h +++ b/include/zfp.h @@ -72,9 +72,10 @@ #define _zfp_str(x) _zfp_str_(x) /* library version information */ -#define ZFP_VERSION_MAJOR 0 /* library major version number */ -#define ZFP_VERSION_MINOR 5 /* library minor version number */ -#define ZFP_VERSION_RELEASE 1 /* library release version number */ +#define ZFP_VERSION_MAJOR 0 /* library major version number */ +#define ZFP_VERSION_MINOR 5 /* library minor version number */ +#define ZFP_VERSION_PATCH 2 /* library patch version number */ +#define ZFP_VERSION_RELEASE ZFP_VERSION_PATCH /* codec version number (see also zfp_codec_version) */ #define ZFP_CODEC 5 @@ -83,13 +84,13 @@ #define ZFP_VERSION \ ((ZFP_VERSION_MAJOR << 8) + \ (ZFP_VERSION_MINOR << 4) + \ - (ZFP_VERSION_RELEASE << 0)) + (ZFP_VERSION_PATCH << 0)) /* library version string (see also zfp_version_string) */ #define ZFP_VERSION_STRING \ _zfp_str(ZFP_VERSION_MAJOR) "." \ _zfp_str(ZFP_VERSION_MINOR) "." 
\ - _zfp_str(ZFP_VERSION_RELEASE) + _zfp_str(ZFP_VERSION_PATCH) /* default compression parameters */ #define ZFP_MIN_BITS 0 /* minimum number of bits per block */ diff --git a/include/zfp/system.h b/include/zfp/system.h index 684346f81..539419648 100644 --- a/include/zfp/system.h +++ b/include/zfp/system.h @@ -34,11 +34,12 @@ #endif #ifdef __GNUC__ - #ifndef CACHE_LINE_SIZE - #define CACHE_LINE_SIZE 0x100 + /* L1 cache line size for alignment purposes */ + #ifndef ZFP_CACHE_LINE_SIZE + #define ZFP_CACHE_LINE_SIZE 0x100 #endif #define align_(n) __attribute__((aligned(n))) - #define cache_align_(x) x align_(CACHE_LINE_SIZE) + #define cache_align_(x) x align_(ZFP_CACHE_LINE_SIZE) #else #define cache_align_(x) x #endif diff --git a/include/zfp/types.h b/include/zfp/types.h index 4be88a6bb..b501ca293 100644 --- a/include/zfp/types.h +++ b/include/zfp/types.h @@ -6,7 +6,10 @@ typedef unsigned short ushort; typedef unsigned int uint; #if __STDC_VERSION__ >= 199901L + /* C99: use standard integer types */ #include + #define INT64C(x) INT64_C(x) + #define UINT64C(x) UINT64_C(x) typedef int8_t int8; typedef uint8_t uint8; typedef int16_t int16; @@ -16,15 +19,56 @@ typedef unsigned int uint; typedef int64_t int64; typedef uint64_t uint64; #else - /* assume common integer types in C89 */ + /* C89: assume common integer types */ typedef signed char int8; typedef unsigned char uint8; typedef signed short int16; typedef unsigned short uint16; + + /* assume 32-bit integers (LP64, LLP64) */ typedef signed int int32; typedef unsigned int uint32; - typedef signed long long int64; /* not ANSI C89 compliant */ - typedef unsigned long long uint64; /* not ANSI C89 compliant */ + + /* determine 64-bit data model */ + #if defined(_WIN32) || defined(_WIN64) + /* assume ILP32 or LLP64 (MSVC, MinGW) */ + #define ZFP_LLP64 1 + #else + /* assume LP64 (Linux, macOS, ...) 
*/ + #define ZFP_LP64 1 + #endif + + /* concatenation for literal suffixes */ + #define _zfp_cat_(x, y) x ## y + #define _zfp_cat(x, y) _zfp_cat_(x, y) + + /* signed 64-bit integers */ + #if defined(ZFP_INT64) && defined(ZFP_INT64_SUFFIX) + #define INT64C(x) _zfp_cat(x, ZFP_INT64_SUFFIX) + typedef ZFP_INT64 int64; + #elif ZFP_LP64 + #define INT64C(x) x ## l + typedef signed long int64; + #elif ZFP_LLP64 + #define INT64C(x) x ## ll + typedef signed long long int64; + #else + #error "unknown 64-bit signed integer type" + #endif + + /* unsigned 64-bit integers */ + #if defined(ZFP_UINT64) && defined(ZFP_UINT64_SUFFIX) + #define UINT64C(x) _zfp_cat(x, ZFP_UINT64_SUFFIX) + typedef ZFP_UINT64 uint64; + #elif ZFP_LP64 + #define UINT64C(x) x ## ul + typedef unsigned long uint64; + #elif ZFP_LLP64 + #define UINT64C(x) x ## ull + typedef unsigned long long uint64; + #else + #error "unknown 64-bit unsigned integer type" + #endif #endif #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 15de13f5d..8e7decd78 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,34 +1,7 @@ -set(zfp_defs) - -if(NOT (ZFP_BIT_STREAM_WORD_SIZE EQUAL 64)) - list(APPEND zfp_defs BIT_STREAM_WORD_TYPE=${ZFP_BIT_STREAM_WORD_SIZE}) -endif() - -if(ZFP_WITH_BIT_STREAM_STRIDED) - list(APPEND zfp_defs BIT_STREAM_STRIDED) -endif() - -if(ZFP_WITH_ALIGNED_ALLOC) - list(APPEND zfp_defs ALIGNED_ALLOC) -endif() - -if(ZFP_WITH_CACHE_TWOWAY) - list(APPEND zfp_defs CACHE_TWOWAY) -endif() - -if(ZFP_WITH_CACHE_FASH_HASH) - list(APPEND zfp_defs CACHE_FAST_HASH) -endif() - -if(ZFP_WITH_CACHE_PROFILING) - list(APPEND zfp_defs CACHE_PROFILING) -endif() - set(zfp_source zfp.c bitstream.c traitsf.h traitsd.h block1.h block2.h block3.h - encode1f.c encode1d.c encode1i.c encode1l.c decode1f.c decode1d.c decode1i.c decode1l.c encode2f.c encode2d.c encode2i.c encode2l.c diff --git a/src/inline/bitstream.c b/src/inline/bitstream.c index e4e6f48eb..fd7d95c17 100644 --- a/src/inline/bitstream.c +++ 
b/src/inline/bitstream.c @@ -127,7 +127,7 @@ struct bitstream { word* begin; /* beginning of stream */ word* end; /* end of stream (currently unused) */ #ifdef BIT_STREAM_STRIDED - size_t mask; /* one less the block size in number of words */ + size_t mask; /* one less the block size in number of words */ ptrdiff_t delta; /* number of words between consecutive blocks */ #endif }; @@ -270,7 +270,7 @@ inline_ uint64 stream_write_bits(bitstream* s, uint64 value, uint n) { /* append bit string to buffer */ - s->buffer += value << s->bits; + s->buffer += (word)(value << s->bits); s->bits += n; /* is buffer full? */ if (s->bits >= wsize) { @@ -284,7 +284,7 @@ stream_write_bits(bitstream* s, uint64 value, uint n) /* assert: 0 <= s->bits <= n */ stream_write_word(s, s->buffer); /* assert: 0 <= n - s->bits < 64 */ - s->buffer = value >> (n - s->bits); + s->buffer = (word)(value >> (n - s->bits)); } while (sizeof(s->buffer) < sizeof(value) && s->bits >= wsize); } /* assert: 0 <= s->bits < wsize */ @@ -371,7 +371,7 @@ stream_pad(bitstream* s, uint n) inline_ size_t stream_align(bitstream* s) { - size_t bits = s->bits; + uint bits = s->bits; if (bits) stream_skip(s, bits); return bits; @@ -381,7 +381,7 @@ stream_align(bitstream* s) inline_ size_t stream_flush(bitstream* s) { - size_t bits = (wsize - s->bits) % wsize; + uint bits = (wsize - s->bits) % wsize; if (bits) stream_pad(s, bits); return bits; diff --git a/src/traitsd.h b/src/traitsd.h index a9767ef1d..cc612b493 100644 --- a/src/traitsd.h +++ b/src/traitsd.h @@ -1,10 +1,10 @@ /* double-precision floating-point traits */ -#define Scalar double /* floating-point type */ -#define Int int64 /* corresponding signed integer type */ -#define UInt uint64 /* corresponding unsigned integer type */ -#define EBITS 11 /* number of exponent bits */ -#define NBMASK 0xaaaaaaaaaaaaaaaaull /* negabinary mask */ +#define Scalar double /* floating-point type */ +#define Int int64 /* corresponding signed integer type */ +#define UInt uint64 
/* corresponding unsigned integer type */ +#define EBITS 11 /* number of exponent bits */ +#define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */ #define FABS(x) fabs(x) #define FREXP(x, e) frexp(x, e) diff --git a/src/traitsl.h b/src/traitsl.h index 153e3ac5a..c4c853467 100644 --- a/src/traitsl.h +++ b/src/traitsl.h @@ -1,6 +1,6 @@ /* 64-bit integer traits */ -#define Scalar int64 /* integer type */ -#define Int int64 /* corresponding signed integer type */ -#define UInt uint64 /* corresponding unsigned integer type */ -#define NBMASK 0xaaaaaaaaaaaaaaaaull /* negabinary mask */ +#define Scalar int64 /* integer type */ +#define Int int64 /* corresponding signed integer type */ +#define UInt uint64 /* corresponding unsigned integer type */ +#define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */ diff --git a/src/zfp.c b/src/zfp.c index dc0a12dc3..2bf2f280a 100644 --- a/src/zfp.c +++ b/src/zfp.c @@ -32,7 +32,7 @@ export_ const uint zfp_codec_version = ZFP_CODEC; export_ const uint zfp_library_version = ZFP_VERSION; -export_ const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (March 28, 2017)"; +export_ const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (September 28, 2017)"; /* private functions ------------------------------------------------------- */ @@ -296,16 +296,16 @@ zfp_field_set_metadata(zfp_field* field, uint64 meta) switch (dims) { case 1: /* currently dimensions are limited to 2^32 - 1 */ - field->nx = (meta & 0x0000ffffffffull) + 1; meta >>= 48; + field->nx = (meta & UINT64C(0x0000ffffffff)) + 1; meta >>= 48; break; case 2: - field->nx = (meta & 0xffffffull) + 1; meta >>= 24; - field->ny = (meta & 0xffffffull) + 1; meta >>= 24; + field->nx = (meta & UINT64C(0xffffff)) + 1; meta >>= 24; + field->ny = (meta & UINT64C(0xffffff)) + 1; meta >>= 24; break; case 3: - field->nx = (meta & 0xffffull) + 1; meta >>= 16; - field->ny = (meta & 0xffffull) + 1; meta >>= 16; - field->nz = (meta & 
0xffffull) + 1; meta >>= 16; + field->nx = (meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->ny = (meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->nz = (meta & UINT64C(0xffff)) + 1; meta >>= 16; break; } field->sx = field->sy = field->sz = 0; @@ -457,7 +457,7 @@ zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, int } if (wra) { /* for write random access, round up to next multiple of stream word size */ - bits += stream_word_bits - 1; + bits += (uint)stream_word_bits - 1; bits &= ~(stream_word_bits - 1); } zfp->minbits = bits; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 474e26184..954c58c09 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,6 @@ add_executable(testzfp testzfp.cpp fields.c) target_link_libraries(testzfp zfp) +target_compile_definitions(testzfp PRIVATE ${zfp_defs}) option(ZFP_BUILD_TESTING_SMALL "Enable small-sized array testing" ON) if(ZFP_BUILD_TESTING_SMALL) diff --git a/tests/testzfp.cpp b/tests/testzfp.cpp index 2eae6bbe5..aee261e4e 100644 --- a/tests/testzfp.cpp +++ b/tests/testzfp.cpp @@ -105,7 +105,7 @@ inline uint test_rate(zfp_stream* stream, const zfp_field* input, double rate, Scalar tolerance, bool timings = false) { uint failures = 0; - uint n = zfp_field_size(input, NULL); + size_t n = zfp_field_size(input, NULL); uint dims = zfp_field_dimensionality(input); zfp_type type = zfp_field_type(input); @@ -188,7 +188,7 @@ inline uint test_precision(zfp_stream* stream, const zfp_field* input, uint precision, size_t bytes) { uint failures = 0; - uint n = zfp_field_size(input, NULL); + size_t n = zfp_field_size(input, NULL); // allocate memory for compressed data zfp_stream_set_precision(stream, precision); @@ -244,7 +244,7 @@ inline uint test_accuracy(zfp_stream* stream, const zfp_field* input, Scalar tolerance, size_t bytes) { uint failures = 0; - uint n = zfp_field_size(input, NULL); + size_t n = zfp_field_size(input, NULL); // allocate memory for compressed data 
tolerance = static_cast(zfp_stream_set_accuracy(stream, tolerance)); @@ -714,6 +714,23 @@ common_tests() std::cout << "library header and binary version mismatch" << std::endl; failures++; } + // ensure integer type sizes are correct + if (sizeof(int8) != 1u || sizeof(uint8) != 1u) { + std::cout << "8-bit integer type is not one byte wide" << std::endl; + failures++; + } + if (sizeof(int16) != 2u || sizeof(uint16) != 2u) { + std::cout << "16-bit integer type is not two bytes wide" << std::endl; + failures++; + } + if (sizeof(int32) != 4u || sizeof(uint32) != 4u) { + std::cout << "32-bit integer type is not four bytes wide" << std::endl; + failures++; + } + if (sizeof(int64) != 8u || sizeof(uint64) != 8u) { + std::cout << "64-bit integer type is not eight bytes wide" << std::endl; + failures++; + } // ensure signed right shifts are arithmetic int32 x32 = -2; if ((x32 >> 1) != -1 || (x32 >> 2) != -1) { @@ -721,7 +738,7 @@ common_tests() failures++; } int64 x64 = -2; - if ((x64 >> 1) != -1ll || (x64 >> 2) != -1ll) { + if ((x64 >> 1) != INT64C(-1) || (x64 >> 2) != INT64C(-1)) { std::cout << "64-bit arithmetic right shift not supported" << std::endl; failures++; } @@ -738,6 +755,32 @@ int main(int argc, char* argv[]) std::cout << zfp_version_string << std::endl; std::cout << "library version " << zfp_library_version << std::endl; std::cout << "CODEC version " << zfp_codec_version << std::endl; + std::cout << "data model "; + size_t model = ((sizeof(uint64) - 1) << 12) + + ((sizeof(void*) - 1) << 8) + + ((sizeof(unsigned long int) - 1) << 4) + + ((sizeof(unsigned int) - 1) << 0); + switch (model) { + case 0x7331u: + std::cout << "LP32"; + break; + case 0x7333u: + std::cout << "ILP32"; + break; + case 0x7733u: + std::cout << "LLP64"; + break; + case 0x7773u: + std::cout << "LP64"; + break; + case 0x7777u: + std::cout << "ILP64"; + break; + default: + std::cout << "unknown (0x" << std::hex << model << ")"; + break; + } + std::cout << std::endl; std::cout << std::endl; uint 
sizes = 0; diff --git a/travis.sh b/travis.sh index a684b8b4b..38937fee7 100755 --- a/travis.sh +++ b/travis.sh @@ -3,7 +3,7 @@ set -e mkdir build cd build -cmake .. +cmake .. -DCMAKE_C_STANDARD=${C_STANDARD:-99} -DCMAKE_CXX_STANDARD=${CXX_STANDARD:-98} cmake --build . ctest -V -C "Debug"