From 24c73b2e7473888782e01b0e7008ff4379b0fc45 Mon Sep 17 00:00:00 2001 From: Manodeep Sinha Date: Sat, 18 Aug 2018 06:23:10 +1000 Subject: [PATCH] ddsmu (#166) * DD(s,mu) function for mocks/theory (#130) * Updated README.rst [ci skip] Weights are on `pip`. Changed repo files to be links. * add a DDsmu mocks function * remove extra slash * add tests of DDsmu_mocks * add name to authors * update to add a mu_max function parameter * bug fixes; verified output against kdcount for different mu_max for AVX, SSE42, and fallback * adding theory DDsmu; verified for different mu_max and all ISA against kdcount * update docs * include DDsmu in theory/tests * fix type error * forgot to remove other variable definition * Updating the docs for (theory) DDsmu * Reviewed the theory functions (still need comprehensive tests and update RTD docs) * Adding the new file, tests_common.h, to allow integration tests (exhaustive tests for new pair-counters). * My (broken) mocks code * Fixing bugs uncovered by doctests (which are still not failing the build) * Trying to solve the doctests failures and the warnings raised during compiling the docs for DDsmu * I have a suspicion that doctests are not failing the build because they are in the 'after_success' part. Moved the doctests into the tests section. Might solve #143 * Attempting to fix #144 * Fixed the Makefile for DDsmu tests * Added the tests for DDsmu_mocks into the Makefile * Whitespace changes only for better readability [ci skip] * Corrected the variable type for nmu_bins and some small changes for better code readability * The output file for DDsmu_mocks.DD really corresponds to DDsmu_mocks.RR (see #132) * Fixed the DDsmu_mocks tests * Changed the name of the DDsmu_mocks test from DD->RR. Put the name of each test on a new line * Attempting to fix travis failure (from doctest failure) * Another attempt at fixing the doctest failure on travis * Next attempt at fixing doctest failure * Small change to the auto-generated docs [ci skip] * Doctests are failing because numpy does not honour set_printoptions for structured arrays (numpy issue #5606). This numpy issue seems to have been solved in 1.12. Bumping the default travis numpy version to 1.12 * Still trying to fix doctest failures. Now removed testing for python3.3 and added python3.6 * Missed the 'then' in the if condition. Added a xcode9 image for osx tests * Added a python3.6 for osx and changed the python version to python2.7 for xcode6 and xcode7 * Corrected the miniconda installer filenames for python2 * Added the numpy version=1.7 for testing the minimum requirements on osx * Added C mode declaration for syntax highlighting [ci skip] * Made sure that mu_max is specified before nmu_bins. Changed the ordering in the python extension as well * Added example C codes for the DDsmu and DDsmu_mocks pair-counters * The case of a mis-placed dot (or how to break the build) * Enforce that mu_max is scalar and greater than 0 * pimax is not required for DDsmu_mocks. Correctly added the parx/pary/parz components into the pair-weight struct for DDsmu_mocks and DDrppi_mocks. Renamed variables to make context clearer (will need to be done for DDrppi_mocks as well) * Renamed sqr_sep to sqr_s and removed checks for pimax * Changed the kernel parameters to smax/smin from sqr_smax/sqr_smin * The AVX tests pass now for DDsmu_mocks * Fixed the INTEGRATION_TEST section for DDtheta_mocks * Updated docstrings in python bindings for DDsmu and DDsmu_mocks * Added docs for DDsmu and DDsmu_mocks. 
Fixed the docstring formatting (removed notes within function docstrings) * Added the missing variable for doctests * Renamed w(theta) to DD(theta) and changed some text formatting * I forgot to fix the DDsmu_mocks file for the doctest failure * DDsmu PR is now ready to be merged. Bumping version to 2.1 * README updated to show that github pages are no longer being published [ci skip] * Filled in some more missing docs/docstrings * Remove further references to github pages site [ci skip] * Adding in the fast_divide option to theory/DDsmu paircounter. Not tested * Fixing the typos in fast-divide part of DDsmu. Added in other changes as well -- oops * Added in the fast_divide option into the main python wrappers. Fixed build failure * Added entries for the upcoming versions and features [ci skip] * Hopefully fixing build failure * Attempting to fix warning during building docs * Add PR # to changelog --- .travis.yml | 7 - CHANGES.rst | 11 +- Corrfunc/mocks/DDrppi_mocks.py | 19 +- Corrfunc/mocks/DDsmu_mocks.py | 28 +- Corrfunc/theory/DDsmu.py | 18 +- common.mk | 241 +++++++++--------- docs/source/conf.py | 3 +- mocks.options | 4 +- .../countpairs_rp_pi_mocks_impl.c.src | 10 +- .../countpairs_rp_pi_mocks_impl.h.src | 2 +- .../countpairs_rp_pi_mocks_kernels.c.src | 84 +++--- .../countpairs_s_mu_mocks_impl.c.src | 10 +- mocks/python_bindings/_countpairs_mocks.c | 34 +-- mocks/tests/tests_mocks.c | 3 +- rules.mk | 2 +- theory.options | 3 +- theory/DDsmu/Makefile | 9 +- theory/DDsmu/countpairs_s_mu_impl.c.src | 20 +- theory/DDsmu/countpairs_s_mu_impl.h.src | 4 +- theory/DDsmu/countpairs_s_mu_kernels.c.src | 29 ++- theory/python_bindings/_countpairs.c | 19 +- theory/tests/Makefile | 5 +- theory/tests/test_nonperiodic.c | 3 +- theory/tests/test_periodic.c | 2 +- utils/Makefile | 4 +- utils/avx_calls.h | 65 ++++- utils/defs.h | 20 +- 27 files changed, 396 insertions(+), 263 deletions(-) diff --git a/.travis.yml b/.travis.yml index ee991793..4b5f61ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,7 +43,6 @@ matrix: # - brew outdated xctool || brew upgrade xctool # - brew tap homebrew/versions && brew install clang-omp # - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh - - os: osx osx_image: xcode9 compiler: clang @@ -66,12 +65,6 @@ matrix: before_install: - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh - # - os: osx - # osx_image: xcode6.4 - # compiler: clang - # env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=2.6 NUMPY_VERSION=1.7 DOCTEST=FALSE - # before_install: - # - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh # - os: osx # compiler: gcc diff --git a/CHANGES.rst b/CHANGES.rst index cf9f96d4..7e08ec6c 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,16 +7,25 @@ New features ------------ -- New pair counter `DD(s, mu)` for theory and mocks - conda installable package +- GPU version 2.1.0 ======= +New features +------------ +- New pair counter `DD(s, mu)` for theory and mocks (contributed by @nickhand, + in #130 and #132) [#166] + + Enhancements ------------ - GSL version now specified and tested by Travis [#164] +- Now possible to specify the number of Newton-Raphson steps to +improve accuracy of approximate reciprocals. 
Available in `DD(rp, pi)` for mocks, +and `DD(s, mu)` for both theory and mocks 2.0.0 diff --git a/Corrfunc/mocks/DDrppi_mocks.py b/Corrfunc/mocks/DDrppi_mocks.py index e98a8be6..dd52fa6e 100644 --- a/Corrfunc/mocks/DDrppi_mocks.py +++ b/Corrfunc/mocks/DDrppi_mocks.py @@ -19,9 +19,9 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile, RA2=None, DEC2=None, CZ2=None, weights2=None, is_comoving_dist=False, verbose=False, output_rpavg=False, - fast_divide=False, xbin_refine_factor=2, - ybin_refine_factor=2, zbin_refine_factor=1, - max_cells_per_dim=100, + fast_divide_and_NR_steps=0, + xbin_refine_factor=2, ybin_refine_factor=2, + zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa=r'fastest', weight_type=None): """ Calculate the 2-D pair-counts corresponding to the projected correlation @@ -169,12 +169,13 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile, suffer from numerical loss of precision and can not be trusted. If you need accurate ``rpavg`` values, then pass in double precision arrays for the particle positions. - - fast_divide : boolean (default false) - Boolean flag to replace the division in ``AVX`` implementation with an - approximate reciprocal, followed by two Newton-Raphson steps. Improves - runtime by ~15-20%. Loss of precision is at the 5-6th decimal place. + fast_divide_and_NR_steps: integer (default 0) + Replaces the division in ``AVX`` implementation with an approximate + reciprocal, followed by ``fast_divide_and_NR_steps`` of Newton-Raphson. + Can improve runtime by ~15-20% on older computers. Value of 0 uses + the standard division operation. + (xyz)bin_refine_factor : integer, default is (2,2,1); typically within [1-3] Controls the refinement on the cell sizes. Can have up to a 20% impact on runtime. @@ -366,7 +367,7 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile, is_comoving_dist=is_comoving_dist, verbose=verbose, output_rpavg=output_rpavg, - fast_divide=fast_divide, + fast_divide_and_NR_steps=fast_divide_and_NR_steps, xbin_refine_factor=xbin_refine_factor, ybin_refine_factor=ybin_refine_factor, zbin_refine_factor=zbin_refine_factor, diff --git a/Corrfunc/mocks/DDsmu_mocks.py b/Corrfunc/mocks/DDsmu_mocks.py index f4da2616..16aa2def 100755 --- a/Corrfunc/mocks/DDsmu_mocks.py +++ b/Corrfunc/mocks/DDsmu_mocks.py @@ -18,9 +18,9 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, RA2=None, DEC2=None, CZ2=None, weights2=None, is_comoving_dist=False, verbose=False, output_savg=False, - fast_divide=False, xbin_refine_factor=2, - ybin_refine_factor=2, zbin_refine_factor=1, - max_cells_per_dim=100, + fast_divide_and_NR_steps=0, + xbin_refine_factor=2, ybin_refine_factor=2, + zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa='fastest', weight_type=None): """ Calculate the 2-D pair-counts corresponding to the projected correlation @@ -121,10 +121,11 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, co-moving distance, rather than `cz`. weights1: array_like, real (float/double), optional - A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,). - `weight_type` specifies how these weights are used; results are returned - in the `weightavg` field. If only one of weights1 and weights2 is - specified, the other will be set to uniform weights. + A scalar, or an array of weights of shape (n_weights, n_positions) + or (n_positions,). 
`weight_type` specifies how these weights are used; + results are returned in the `weightavg` field. If only one of + ``weights1`` or ``weights2`` is specified, the other will be set + to uniform weights. RA2: array-like, real (float/double) The array of Right Ascensions for the second set of points. RA's @@ -171,11 +172,12 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, values, then pass in double precision arrays for the particle positions. - fast_divide: boolean (default false) - Boolean flag to replace the division in ``AVX`` implementation with an - approximate reciprocal, followed by a Newton-Raphson step. Improves - runtime by ~15-20%. Loss of precision is at the 5-6th decimal place. - + fast_divide_and_NR_steps: integer (default 0) + Replaces the division in ``AVX`` implementation with an approximate + reciprocal, followed by ``fast_divide_and_NR_steps`` of Newton-Raphson. + Can improve runtime by ~15-20% on older computers. Value of 0 uses + the standard division operation. + (xyz)bin_refine_factor: integer, default is (2,2,1); typically within [1-3] Controls the refinement on the cell sizes. Can have up to a 20% impact on runtime. @@ -290,7 +292,7 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, is_comoving_dist=is_comoving_dist, verbose=verbose, output_savg=output_savg, - fast_divide=fast_divide, + fast_divide_and_NR_steps=fast_divide_and_NR_steps, xbin_refine_factor=xbin_refine_factor, ybin_refine_factor=ybin_refine_factor, zbin_refine_factor=zbin_refine_factor, diff --git a/Corrfunc/theory/DDsmu.py b/Corrfunc/theory/DDsmu.py index b8478f80..17659739 100644 --- a/Corrfunc/theory/DDsmu.py +++ b/Corrfunc/theory/DDsmu.py @@ -14,11 +14,12 @@ def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=None, - periodic=True, X2=None, Y2=None, Z2=None, weights2=None, - verbose=False, boxsize=0.0, output_savg=False, - xbin_refine_factor=2, ybin_refine_factor=2, - zbin_refine_factor=1, max_cells_per_dim=100, - c_api_timer=False, isa=r'fastest', weight_type=None): + periodic=True, X2=None, Y2=None, Z2=None, weights2=None, + verbose=False, boxsize=0.0, output_savg=False, + fast_divide_and_NR_steps=0, + xbin_refine_factor=2, ybin_refine_factor=2, + zbin_refine_factor=1, max_cells_per_dim=100, + c_api_timer=False, isa=r'fastest', weight_type=None): """ Calculate the 2-D pair-counts corresponding to the redshift-space correlation function, :math:`\\xi(s, \mu)` Pairs which are separated @@ -111,6 +112,12 @@ def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=No precision and can not be trusted. If you need accurate ``s`` values, then pass in double precision arrays for the particle positions. + fast_divide_and_NR_steps: integer (default 0) + Replaces the division in ``AVX`` implementation with an approximate + reciprocal, followed by ``fast_divide_and_NR_steps`` of Newton-Raphson. + Can improve runtime by ~15-20% on older computers. Value of 0 uses + the standard division operation. + (xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) Controls the refinement on the cell sizes. Can have up to a 20% impact on runtime. 
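
A minimal usage sketch of the new keyword (illustrative only, not part of this patch): the `fast_divide_and_NR_steps` option slots into the theory `DDsmu` call like any other optional argument. The bin file path and the random particle positions below are placeholders, and `binfile` is assumed to name a file of s-bin edges, as for the other theory pair-counters.

    import numpy as np
    from Corrfunc.theory import DDsmu

    boxsize = 420.0
    N = 100000
    X = np.random.uniform(0.0, boxsize, N)
    Y = np.random.uniform(0.0, boxsize, N)
    Z = np.random.uniform(0.0, boxsize, N)

    results = DDsmu(autocorr=1, nthreads=4, binfile='sbins',  # placeholder bin file
                    mu_max=1.0, nmu_bins=20,
                    X1=X, Y1=Y, Z1=Z,
                    periodic=True, boxsize=boxsize,
                    output_savg=True,
                    fast_divide_and_NR_steps=2)  # 0 (the default) keeps the exact divide

With a non-zero value, the AVX kernel swaps the divide for an approximate reciprocal refined by that many Newton-Raphson steps; per the docstring above, this can buy roughly 15-20% runtime on older CPUs at a small cost in precision.
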
@@ -283,6 +290,7 @@ def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=No verbose=verbose, boxsize=boxsize, output_savg=output_savg, + fast_divide_and_NR_steps=fast_divide_and_NR_steps, xbin_refine_factor=xbin_refine_factor, ybin_refine_factor=ybin_refine_factor, zbin_refine_factor=zbin_refine_factor, diff --git a/common.mk b/common.mk index 455e2187..b2da73f2 100644 --- a/common.mk +++ b/common.mk @@ -245,17 +245,11 @@ ifeq ($(DO_CHECKS), 1) $(error $(ccred) DOUBLE_PREC must be enabled with OUTPUT_THETAAVG -- loss of precision will give you incorrect results for the outer bins (>=20-30 million pairs) $(ccreset)) endif endif - - # ifeq (FAST_DIVIDE,$(findstring FAST_DIVIDE,$(OPT))) - # ifneq (USE_AVX,$(findstring USE_AVX,$(OPT))) - # $(warning Makefile option $(ccblue)"FAST_DIVIDE"$(ccreset) will not do anything unless $(ccblue)USE_AVX$(ccreset) is set) - # endif - # endif ## done with check for conflicting options ifeq (icc,$(findstring icc,$(CC))) CFLAGS += -xhost -opt-prefetch -opt-prefetch-distance=16 #-vec-report6 - ifeq (USE_OMP,$(findstring USE_OMP,$(OPT))) + ifeq (USE_OMP,$(findstring USE_OMP,$(OPT))) CFLAGS += -openmp CLINK += -openmp endif ##openmp with icc @@ -376,123 +370,140 @@ ifeq ($(DO_CHECKS), 1) # All of the python/numpy checks follow export PYTHON_CHECKED ?= 0 export NUMPY_CHECKED ?= 0 + export COMPILE_PYTHON_EXT ?= 0 ifeq ($(PYTHON_CHECKED), 0) - export COMPILE_PYTHON_EXT := 1 - export PYTHON_VERSION_FULL := $(wordlist 2,4,$(subst ., ,$(shell $(PYTHON) --version 2>&1))) - export PYTHON_VERSION_MAJOR := $(word 1,${PYTHON_VERSION_FULL}) - export PYTHON_VERSION_MINOR := $(word 2,${PYTHON_VERSION_FULL}) - - ## I only need this so that I can print out the full python version (correctly) - ## in case of error - PYTHON_VERSION_PATCH := $(word 3,${PYTHON_VERSION_FULL}) - - ## Check numpy version - export NUMPY_VERSION_FULL := $(wordlist 1,3,$(subst ., ,$(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print(numpy.__version__)"))) - export NUMPY_VERSION_MAJOR := $(word 1,${NUMPY_VERSION_FULL}) - export NUMPY_VERSION_MINOR := $(word 2,${NUMPY_VERSION_FULL}) - - ## Same reason as python patch level. - NUMPY_VERSION_PATCH := $(word 3,${NUMPY_VERSION_FULL}) - - ### Check for minimum python + numpy versions. In theory, I should also check - ### that *any* python and numpy are available but that seems too much effort - MIN_PYTHON_MAJOR := 2 - MIN_PYTHON_MINOR := 6 - - MIN_NUMPY_MAJOR := 1 - MIN_NUMPY_MINOR := 7 - - PYTHON_AVAIL := $(shell [ $(PYTHON_VERSION_MAJOR) -gt $(MIN_PYTHON_MAJOR) -o \( $(PYTHON_VERSION_MAJOR) -eq $(MIN_PYTHON_MAJOR) -a $(PYTHON_VERSION_MINOR) -ge $(MIN_PYTHON_MINOR) \) ] && echo true) - NUMPY_AVAIL := $(shell [ $(NUMPY_VERSION_MAJOR) -gt $(MIN_NUMPY_MAJOR) -o \( $(NUMPY_VERSION_MAJOR) -eq $(MIN_NUMPY_MAJOR) -a $(NUMPY_VERSION_MINOR) -ge $(MIN_NUMPY_MINOR) \) ] && echo true) + # This is very strange -- requested 'version' info goes to stderr!! 
+ # anything user-requested should always go to stdout IMHO -- MS 17/8/2018 + # Only stdout is passed back as the output; therefore need to redirect + # stderr to stdout, and then capture that output to `PYTHON_FOUND` + PYTHON_FOUND := $(shell $(PYTHON) --version 2>&1)) + PYTHON_CHECKED := 1 + ifdef PYTHON_FOUND + export PYTHON_VERSION_FULL := $(wordlist 2,4,$(subst ., ,${PYTHON_FOUND})) + export PYTHON_VERSION_MAJOR := $(word 1,${PYTHON_VERSION_FULL}) + export PYTHON_VERSION_MINOR := $(word 2,${PYTHON_VERSION_FULL}) + + ## I only need this so that I can print out the full python version (correctly) + ## in case of error + PYTHON_VERSION_PATCH := $(word 3,${PYTHON_VERSION_FULL}) + + ## Check numpy version + export NUMPY_VERSION_FULL := $(wordlist 1,3,$(subst ., ,$(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print(numpy.__version__)"))) + export NUMPY_VERSION_MAJOR := $(word 1,${NUMPY_VERSION_FULL}) + export NUMPY_VERSION_MINOR := $(word 2,${NUMPY_VERSION_FULL}) + + ## Same reason as python patch level. + NUMPY_VERSION_PATCH := $(word 3,${NUMPY_VERSION_FULL}) + + ### Check for minimum python + numpy versions. In theory, I should also check + ### that *any* python and numpy are available but that seems too much effort + MIN_PYTHON_MAJOR := 2 + MIN_PYTHON_MINOR := 6 + + MIN_NUMPY_MAJOR := 1 + MIN_NUMPY_MINOR := 7 + + PYTHON_AVAIL := $(shell [ $(PYTHON_VERSION_MAJOR) -gt $(MIN_PYTHON_MAJOR) -o \( $(PYTHON_VERSION_MAJOR) -eq $(MIN_PYTHON_MAJOR) -a $(PYTHON_VERSION_MINOR) -ge $(MIN_PYTHON_MINOR) \) ] && echo true) + NUMPY_AVAIL := $(shell [ $(NUMPY_VERSION_MAJOR) -gt $(MIN_NUMPY_MAJOR) -o \( $(NUMPY_VERSION_MAJOR) -eq $(MIN_NUMPY_MAJOR) -a $(NUMPY_VERSION_MINOR) -ge $(MIN_NUMPY_MINOR) \) ] && echo true) + + ifeq ($(PYTHON_AVAIL),true) + ifeq ($(NUMPY_AVAIL),true) + export COMPILE_PYTHON_EXT := 1 + endif + endif - ifneq ($(PYTHON_AVAIL),true) - $(warning $(ccmagenta) Found python version $(PYTHON_VERSION_MAJOR).$(PYTHON_VERSION_MINOR).$(PYTHON_VERSION_PATCH) but minimum required python is $(MIN_PYTHON_MAJOR).$(MIN_PYTHON_MINOR) $(ccreset)) - COMPILE_PYTHON_EXT := 0 - endif + ifneq ($(PYTHON_AVAIL),true) + $(warning $(ccmagenta) Found python version $(PYTHON_VERSION_MAJOR).$(PYTHON_VERSION_MINOR).$(PYTHON_VERSION_PATCH) but minimum required python is $(MIN_PYTHON_MAJOR).$(MIN_PYTHON_MINOR) $(ccreset)) + export COMPILE_PYTHON_EXT := 0 + endif - ifneq ($(NUMPY_AVAIL),true) - $(warning $(ccmagenta) Found NUMPY version $(NUMPY_VERSION_MAJOR).$(NUMPY_VERSION_MINOR).$(NUMPY_VERSION_PATCH) but minimum required numpy is $(MIN_NUMPY_MAJOR).$(MIN_NUMPY_MINOR) $(ccreset)) - COMPILE_PYTHON_EXT := 0 - endif + ifneq ($(NUMPY_AVAIL),true) + $(warning $(ccmagenta) Found NUMPY version $(NUMPY_VERSION_MAJOR).$(NUMPY_VERSION_MINOR).$(NUMPY_VERSION_PATCH) but minimum required numpy is $(MIN_NUMPY_MAJOR).$(MIN_NUMPY_MINOR) $(ccreset)) + export COMPILE_PYTHON_EXT := 0 + endif - ifneq ($(COMPILE_PYTHON_EXT), 0) - ifndef PYTHON_CONFIG_EXE - ifeq ($(PYTHON_VERSION_MAJOR), 2) - PYTHON_CONFIG_EXE:=python-config - else - PYTHON_CONFIG_EXE:=python3-config + ifneq ($(COMPILE_PYTHON_EXT), 0) + ifndef PYTHON_CONFIG_EXE + ifeq ($(PYTHON_VERSION_MAJOR), 2) + PYTHON_CONFIG_EXE:=python-config + else + PYTHON_CONFIG_EXE:=python3-config + endif + ifneq ($(PYTHON), python) + PYTHON_CONFIG_EXE:=$(dir $(PYTHON))$(PYTHON_CONFIG_EXE) + $(warning $(ccblue)"PYTHON"$(ccreset) is set to $(ccblue)$(PYTHON)$(ccreset); using $(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset) as $(ccblue)python-config$(ccreset). 
If this is not correct, please also set $(ccblue)"PYTHON_CONFIG_EXE"$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset)) + endif endif - ifneq ($(PYTHON), python) - PYTHON_CONFIG_EXE:=$(dir $(PYTHON))$(PYTHON_CONFIG_EXE) - $(warning $(ccblue)"PYTHON"$(ccreset) is set to $(ccblue)$(PYTHON)$(ccreset); using $(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset) as $(ccblue)python-config$(ccreset). If this is not correct, please also set $(ccblue)"PYTHON_CONFIG_EXE"$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset)) + PYTHON_CONFIG_INCL := $(shell $(PYTHON_CONFIG_EXE) --includes 2>/dev/null) + ifndef PYTHON_CONFIG_INCL + $(error $(ccred)python-config$(ccreset) ($(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset)) not found. Please set $(ccgreen)PYTHON_CONFIG_EXE$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset) before installing $(DISTNAME).$(VERSION). Installing $(ccblue)python-devel$(ccreset) might fix this issue $(ccreset)) endif - endif - PYTHON_CONFIG_INCL := $(shell $(PYTHON_CONFIG_EXE) --includes 2>/dev/null) - ifndef PYTHON_CONFIG_INCL - $(error $(ccred)python-config$(ccreset) ($(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset)) not found. Please set $(ccgreen)PYTHON_CONFIG_EXE$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset) before installing $(DISTNAME).$(VERSION). Installing $(ccblue)python-devel$(ccreset) might fix this issue $(ccreset)) - endif - PYTHON_CONFIG_INCL:=$(patsubst -I%,-isystem%, $(PYTHON_CONFIG_INCL)) - - # NUMPY is available -> next step should not fail - # That's why we are not checking if the NUMPY_INCL_FLAG is defined. - ifeq ($(NUMPY_CHECKED), 0) - export NUMPY_INCL_FLAG := $(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print('-isystem ' + numpy.__path__[0] + '/core/include/numpy/')") - # Take the second word -> the path (the first word is "isystem") - NUMPY_INCL_PATH := $(word 2, ${NUMPY_INCL_FLAG}) - # Now check that the 'arrayobject.h' file is present in the - # supposed numpy directory. Otherwise, compilation will fail. - # The absence of the file likely indicates a missing numpy-devel - # package (see issue #134 on github) - NUMPY_NEEDED_HEADER_FILE := ${NUMPY_INCL_PATH}arrayobject.h - ifeq (,$(wildcard ${NUMPY_NEEDED_HEADER_FILE})) - $(error Required $(ccred)numpy headers$(ccreset) are missing...stopping the compilation. You might be able to fix this by installing $(ccblue)numpy-devel$(ccreset)) + PYTHON_CONFIG_INCL:=$(patsubst -I%,-isystem%, $(PYTHON_CONFIG_INCL)) + + # NUMPY is available -> next step should not fail + # That's why we are not checking if the NUMPY_INCL_FLAG is defined. + ifeq ($(NUMPY_CHECKED), 0) + export NUMPY_INCL_FLAG := $(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print('-isystem ' + numpy.__path__[0] + '/core/include/numpy/')") + # Take the second word -> the path (the first word is "isystem") + NUMPY_INCL_PATH := $(word 2, ${NUMPY_INCL_FLAG}) + # Now check that the 'arrayobject.h' file is present in the + # supposed numpy directory. Otherwise, compilation will fail. + # The absence of the file likely indicates a missing numpy-devel + # package (see issue #134 on github) + NUMPY_NEEDED_HEADER_FILE := ${NUMPY_INCL_PATH}arrayobject.h + ifeq (,$(wildcard ${NUMPY_NEEDED_HEADER_FILE})) + $(error Required $(ccred)numpy headers$(ccreset) are missing...stopping the compilation. 
You might be able to fix this by installing $(ccblue)numpy-devel$(ccreset)) + endif + export NUMPY_CHECKED:=1 endif - export NUMPY_CHECKED:=1 - endif - export PYTHON_CFLAGS := $(PYTHON_CONFIG_INCL) $(NUMPY_INCL_FLAG) - export PYTHON_LIBDIR := $(shell $(PYTHON_CONFIG_EXE) --prefix)/lib - export PYTHON_LIBS := $(shell $(PYTHON_CONFIG_EXE) --libs) - export PYTHON_LINK := - # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) - # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) - SOABI := $(shell $(PYTHON) -c "from __future__ import print_function; import sysconfig; print(sysconfig.get_config_var('SOABI'))" 2>/dev/null) - export PYTHON_SOABI := - ifdef SOABI - ifneq ($(SOABI), None) - PYTHON_SOABI = .$(SOABI) + export PYTHON_CFLAGS := $(PYTHON_CONFIG_INCL) $(NUMPY_INCL_FLAG) + export PYTHON_LIBDIR := $(shell $(PYTHON_CONFIG_EXE) --prefix)/lib + export PYTHON_LIBS := $(shell $(PYTHON_CONFIG_EXE) --libs) + export PYTHON_LINK := + # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) + # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) + SOABI := $(shell $(PYTHON) -c "from __future__ import print_function; import sysconfig; print(sysconfig.get_config_var('SOABI'))" 2>/dev/null) + export PYTHON_SOABI := + ifdef SOABI + ifneq ($(SOABI), None) + PYTHON_SOABI = .$(SOABI) + endif endif - endif - export PYTHON_SOABI - # export PYTHON_LIB_BASE := $(strip $(subst -l,lib, $(filter -lpython%,$(PYTHON_LIBS)))) - - ### Check if conda is being used on OSX - then we need to fix python link libraries - export FIX_PYTHON_LINK := 0 - # ifeq ($(CONDA_BUILD), 0) - # ## Check if conda build is under progress -> do nothing in that case. Let conda handle it - # ifeq ($(UNAME), Darwin) - # PATH_TO_PYTHON := $(shell which python) - # ifeq (conda, $(findstring conda, $(PATH_TO_PYTHON))) - # FIX_PYTHON_LINK := 1 - # endif - # endif - # endif - ifeq ($(UNAME), Darwin) - # PYTHON_LINK := $(filter-out -framework, $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out -ldl, $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out CoreFoundation, $(PYTHON_LINK)) - PYTHON_LINK += -dynamiclib -Wl,-compatibility_version,$(ABI_COMPAT_VERSION) -Wl,-current_version,$(VERSION) -undefined dynamic_lookup - PYTHON_LINK += -headerpad_max_install_names - - ### Another check for stack-size. travis ci chokes on this with gcc - # comma := , - # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out -stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) - endif #Darwin checks - export PYTHON_CHECKED:=1 - endif # compile python extensions - endif + export PYTHON_SOABI + # export PYTHON_LIB_BASE := $(strip $(subst -l,lib, $(filter -lpython%,$(PYTHON_LIBS)))) + + ### Check if conda is being used on OSX - then we need to fix python link libraries + export FIX_PYTHON_LINK := 0 + # ifeq ($(CONDA_BUILD), 0) + # ## Check if conda build is under progress -> do nothing in that case. 
Let conda handle it + # ifeq ($(UNAME), Darwin) + # PATH_TO_PYTHON := $(shell which python) + # ifeq (conda, $(findstring conda, $(PATH_TO_PYTHON))) + # FIX_PYTHON_LINK := 1 + # endif + # endif + # endif + ifeq ($(UNAME), Darwin) + # PYTHON_LINK := $(filter-out -framework, $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out -ldl, $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out CoreFoundation, $(PYTHON_LINK)) + PYTHON_LINK += -dynamiclib -Wl,-compatibility_version,$(ABI_COMPAT_VERSION) -Wl,-current_version,$(VERSION) -undefined dynamic_lookup + PYTHON_LINK += -headerpad_max_install_names + + ### Another check for stack-size. travis ci chokes on this with gcc + # comma := , + # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out -stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) + endif #Darwin checks + export PYTHON_FOUND :=1 + endif # compile python extensions + else + $(warning There was an error running python -- currently set to $(ccblue)[${PYTHON}]$(ccreset)) + $(warning Skipping the creation of python bindings) + endif ## ifdef PYTHON_FOUND + endif ## PYTHON_CHECKED ### Done with python checks diff --git a/docs/source/conf.py b/docs/source/conf.py index 1b5f3406..ad17b7db 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -175,7 +175,8 @@ def __getattr__(cls, name): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] +html_static_path = [] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/mocks.options b/mocks.options index e82ce2e6..e95844cb 100644 --- a/mocks.options +++ b/mocks.options @@ -1,6 +1,6 @@ -### Special option for DDrppi_mocks +### Special option for DDrppi_mocks/DDsmu_mocks OPT += -DOUTPUT_RPAVG ### Enabling this DOES NOT cause too much of a runtime-hit for DDrppi (<= 10% performance hit) -#OPT += -DFAST_DIVIDE ##replaces divide in DDrppi with approximate divides. If you really must get that extra ~20% performance boost +#OPT += -DFAST_DIVIDE=2 ##replaces a divide with approximate reciprocals, followed by 'FAST_DIVIDE' number of Newton-Raphson steps. Trade-off between speed and accuracy; may be slower on newer computers ### Specific options for wtheta (DDtheta_mocks.c) #OPT += -DOUTPUT_THETAAVG diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 46c25013..1618906c 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -220,6 +220,12 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE if(options->c_api_timer) { gettimeofday(&t0, NULL); } + if(options->fast_divide_and_NR_steps >= MAX_FAST_DIVIDE_NR_STEPS) { + fprintf(stderr, ANSI_COLOR_MAGENTA"Warning: The number of requested Newton-Raphson steps = %u is larger than max. allowed steps = %u." 
+ " Switching to a standard divide"ANSI_COLOR_RESET"\n", + options->fast_divide_and_NR_steps, MAX_FAST_DIVIDE_NR_STEPS); + options->fast_divide_and_NR_steps = 0; + } //Check inputs if(ND1 == 0 || (autocorr == 0 && ND2 == 0)) { @@ -604,7 +610,7 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE const int status = countpairs_rp_pi_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N1, x1, y1, z1, d1, weights1, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, sqr_rpmax, sqr_rpmin, nrpbin, npibin, rupp_sqr, pimax,max_sep, this_rpavg, npairs, @@ -632,7 +638,7 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE const int status = countpairs_rp_pi_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N2, x2, y2, z2, d2, weights2, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, sqr_rpmax, sqr_rpmin, nrpbin, npibin, rupp_sqr, pimax,max_sep, this_rpavg, npairs, diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src index 91ca2777..14bc560a 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src @@ -24,7 +24,7 @@ extern "C" { typedef int (*countpairs_mocks_func_ptr_DOUBLE)(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, uint64_t *src_npairs, diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src index dde05f57..b2e8fad0 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src @@ -26,7 +26,7 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, @@ -177,7 +177,7 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, AVX_FLOATS m_sqr_Dpar, m_sqr_Dperp; { - const AVX_FLOATS m_dsep = AVX_SUBTRACT_FLOATS(AVX_SQUARE_FLOAT(m_d2), AVX_SQUARE_FLOAT(m_dpos)); + const AVX_FLOATS m_s_dot_l = AVX_SUBTRACT_FLOATS(AVX_SQUARE_FLOAT(m_d2), AVX_SQUARE_FLOAT(m_dpos)); /* const AVX_FLOATS m_dz_mask = AVX_COMPARE_FLOATS(m_perpz, m_max_sep, _CMP_LT_OQ); */ /* if(AVX_TEST_COMPARISON(m_dz_mask) == 0) { */ @@ -185,12 +185,12 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, /* break; */ /* } */ - const AVX_FLOATS m_numerator = AVX_SQUARE_FLOAT(m_dsep); + const AVX_FLOATS m_sqr_s_dot_l = AVX_SQUARE_FLOAT(m_s_dot_l); const AVX_FLOATS m_sqr_perpx = AVX_SQUARE_FLOAT(m_perpx); const AVX_FLOATS m_sqr_perpy = AVX_SQUARE_FLOAT(m_perpy); const AVX_FLOATS m_sqr_perpz = AVX_SQUARE_FLOAT(m_perpz); const 
AVX_FLOATS m_sqr_sep = AVX_ADD_FLOATS(m_sqr_perpx, AVX_ADD_FLOATS(m_sqr_perpy, m_sqr_perpz));//3-d separation - + //The 3-d separation (| s.s |)^2 *must* be less than (pimax^2 + rpmax^2). If not, one of the //constraints for counting the pair (i.e., rp < rpmax, \pi < pimax) must be violated and //we would discard the pair. @@ -201,53 +201,33 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, //However, division is slow -> so we will check if \pimax^2 * |l| ^2 < |s.l|^2. If not, then the //value of \pi (after division) *must* be larger than \pimax -> in which case we would //not count that pair anway. - const AVX_FLOATS m_sqr_pimax_times_l = AVX_MULTIPLY_FLOATS(m_sqr_pimax, m_sqr_norm_l); - const AVX_FLOATS m_mask_pimax_sep = AVX_COMPARE_FLOATS(m_numerator, m_sqr_pimax_times_l, _CMP_LT_OQ);// is pi < pimax ? + const AVX_FLOATS m_sqr_pimax_times_sqr_l = AVX_MULTIPLY_FLOATS(m_sqr_pimax, m_sqr_norm_l); + const AVX_FLOATS m_mask_pimax_sep = AVX_COMPARE_FLOATS(m_sqr_s_dot_l, m_sqr_pimax_times_sqr_l, _CMP_LT_OQ);// is pi < pimax ? + +#if 0 + /* This could have been an additional check before the divide but does not seem to boost performance */ + const AVX_FLOATS m_sqr_rpmax_times_sqr_l = AVX_MULTIPLY_FLOATS(m_sqr_rpmax, m_sqr_norm_l); + const AVX_FLOATS m_sqr_s_minus_sqr_sdotl = AVX_SUBTRACT_FLOATS(m_sqr_sep, m_sqr_s_dot_l); + const AVX_FLOATS m_sqr_rpmax_initial_mask = AVX_COMPARE_FLOATS(m_sqr_s_minus_sqr_sdotl, m_sqr_rpmax_times_sqr_l, _CMP_LT_OQ);/* is rp < rpmax */ + //If the bits are all 0, then *none* of the pairs satisfy the pimax + rpmax constraints. + const AVX_FLOATS m_mask = AVX_BITWISE_AND(AVX_BITWISE_AND(m_mask_3d_sep, m_mask_pimax_sep), m_sqr_rpmax_initial_mask); +#else const AVX_FLOATS m_mask = AVX_BITWISE_AND(m_mask_3d_sep, m_mask_pimax_sep); +#endif + if(AVX_TEST_COMPARISON(m_mask)==0) { continue; } - if(fast_divide == 0) { - //regular division -> slow op - m_sqr_Dpar = AVX_DIVIDE_FLOATS(m_numerator,m_sqr_norm_l); - //The divide is the actual operation we need - // but divides are about 10x slower than multiplies. So, I am replacing it - //with a approximate reciprocal in floating point - // + 2 iterations of newton-raphson in case of DOUBLE - } else { - //following blocks do an approximate reciprocal followed by two iterations of Newton-Raphson - //However, the exact implementation depends on the precision. floats have an inbuilt approx. reciprocal - //but doubles do not. So, we have to 'fake' an approximate reciprocal for doubles by converting to float - //taking the approximate reciprocal, and then convert back to double -#ifndef DOUBLE_PREC - const AVX_FLOATS rc = _mm256_rcp_ps(m_sqr_norm_l);//intrinsic for 256 bit approximate reciprocal -#else - //we have to do this for doubles now. - //if the vrcpps instruction is not generated, there will - //be a ~70 cycle performance hit from switching between - //AVX and SSE modes. - const __m128 float_tmp1 = _mm256_cvtpd_ps(m_sqr_norm_l);//convert double to float -> not avx_floats := _m256d - //(convert 4 doubles into 4 floats -> use half of available 256 bit SIMD registers) - __m128 float_inv_tmp1 = _mm_rcp_ps(float_tmp1);//intrinsic for 128 bit float approximate reciprocal - const AVX_FLOATS rc = _mm256_cvtps_pd(float_inv_tmp1);//convert back to double -#endif//DOUBLE_PREC - - //We have the double->float->approx. reciprocal->double process done. - //Now improve the accuracy of the divide with newton-raphson. 
- - //Ist iteration of NewtonRaphson - const AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0); - const AVX_FLOATS rc1 = AVX_MULTIPLY_FLOATS(rc, - AVX_SUBTRACT_FLOATS(two, - AVX_MULTIPLY_FLOATS(m_sqr_norm_l,rc))); - //2nd iteration of NewtonRaphson - const AVX_FLOATS rc2 = AVX_MULTIPLY_FLOATS(rc1, - AVX_SUBTRACT_FLOATS(two, - AVX_MULTIPLY_FLOATS(m_sqr_norm_l,rc1))); - m_sqr_Dpar = AVX_MULTIPLY_FLOATS(m_numerator,rc2); - }//end of FAST_DIVIDE + + /* Check if fast_divide is enabled and either use the normal divide or + use the approx. reciprocal followed by `fast_divide_and_NR_steps` number + of Newton-Raphson steps to improve numerical accuracy. + + macro is defined in `avx_calls.h` + */ + CHECK_AND_FAST_DIVIDE(m_sqr_Dpar, m_sqr_s_dot_l, m_sqr_norm_l, fast_divide_and_NR_steps); m_sqr_Dperp = AVX_SUBTRACT_FLOATS(m_sqr_sep,m_sqr_Dpar); } @@ -413,7 +393,7 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, @@ -429,7 +409,7 @@ static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0, const int32_t need_rpavg = src_rpavg != NULL; const int32_t need_weightavg = src_weightavg != NULL; - (void) fast_divide; //unused + (void) fast_divide_and_NR_steps; //unused SSE_FLOATS m_rupp_sqr[nbin]; for(int i=0;i in which case we would //not count that pair anway. const SSE_FLOATS m_sqr_pimax_times_l = SSE_MULTIPLY_FLOATS(m_sqr_pimax, m_sqr_norm_l); - const SSE_FLOATS m_mask_pimax_sep = SSE_COMPARE_FLOATS_LT(m_numerator, m_sqr_pimax_times_l);// is pi < pimax ? + const SSE_FLOATS m_mask_pimax_sep = SSE_COMPARE_FLOATS_LT(m_sqr_s_dot_l, m_sqr_pimax_times_l);// is pi < pimax ? //If the bits are all 0, then *none* of the pairs satisfy the pimax + rpmax constraints. const SSE_FLOATS m_mask = SSE_BITWISE_AND(m_mask_3d_sep, m_mask_pimax_sep); if(SSE_TEST_COMPARISON(m_mask)==0) { continue; } - m_sqr_Dpar = SSE_DIVIDE_FLOATS(m_numerator,m_sqr_norm_l); + m_sqr_Dpar = SSE_DIVIDE_FLOATS(m_sqr_s_dot_l,m_sqr_norm_l); //The divide is the actual operation we need // but divides are about 10x slower than multiplies. 
m_sqr_Dperp = SSE_SUBTRACT_FLOATS(m_sqr_sep,m_sqr_Dpar); @@ -764,7 +744,7 @@ static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0, static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, uint64_t *src_npairs, @@ -781,7 +761,7 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL const int32_t need_rpavg = src_rpavg != NULL; const int32_t need_weightavg = src_weightavg != NULL; - (void) fast_divide;//unused parameter but required to keep the same function signature amongst the kernels + (void) fast_divide_and_NR_steps;//unused parameter but required to keep the same function signature amongst the kernels /*----------------- FALLBACK CODE --------------------*/ const int64_t totnbins = (npibin+1)*(nbin+1); diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index b4fbf26a..7443edf8 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -221,6 +221,12 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D if(options->c_api_timer) { gettimeofday(&t0, NULL); } + if(options->fast_divide_and_NR_steps >= MAX_FAST_DIVIDE_NR_STEPS) { + fprintf(stderr, ANSI_COLOR_MAGENTA"Warning: The number of requested Newton-Raphson steps = %u is larger than max. allowed steps = %u." + " Switching to a standard divide"ANSI_COLOR_RESET"\n", + options->fast_divide_and_NR_steps, MAX_FAST_DIVIDE_NR_STEPS); + options->fast_divide_and_NR_steps = 0; + } //Check inputs if(ND1 == 0 || (autocorr == 0 && ND2 == 0)) { @@ -606,7 +612,7 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D const int status = countpairs_s_mu_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N1, x1, y1, z1, d1, weights1, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, smax, smin, nsbin, nmu_bins, supp_sqr, mu_max, this_savg, npairs, @@ -634,7 +640,7 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D const int status = countpairs_s_mu_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N2, x2, y2, z2, d2, weights2, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, smax, smin, nsbin, nmu_bins, supp_sqr, mu_max, this_savg, npairs, diff --git a/mocks/python_bindings/_countpairs_mocks.c b/mocks/python_bindings/_countpairs_mocks.c index 2d34bcc4..2240aa6c 100644 --- a/mocks/python_bindings/_countpairs_mocks.c +++ b/mocks/python_bindings/_countpairs_mocks.c @@ -75,7 +75,7 @@ static PyMethodDef module_methods[] = { " RA2=None, DEC2=None, CZ2=None, weights2=None,\n" " is_comoving_dist=False,\n" " verbose=False, output_rpavg=False,\n" - " fast_divide=False, xbin_refine_factor=2, \n" + " fast_divide_and_NR_steps=0, xbin_refine_factor=2, \n" " ybin_refine_factor=2, zbin_refine_factor=1, \n" " max_cells_per_dim=100, \n" " c_api_timer=False, isa=-1)\n" @@ -175,10 +175,11 @@ static PyMethodDef module_methods[] = { " precision and can not be trusted. 
If you need accurate ``rpavg``\n" " values, then pass in double precision arrays for the particle positions.\n" "\n" - "fast_divide: boolean (default false)\n" - " Boolean flag to replace the division in ``AVX`` implementation with an\n" - " approximate reciprocal, followed by a Newton-Raphson step. Improves\n" - " runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.\n" + "fast_divide_and_NR_steps: integer (default 0)\n" + " Replaces the division in ``AVX`` implementation with an\n" + " approximate reciprocal, followed by ``fast_divide_and_NR_steps`` " + " Newton-Raphson step. Can improve \n" + " runtime by ~15-20%. Value of 0 keeps the standard division.\n" "\n" "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n" " Controls the refinement on the cell sizes. Can have up to a 20% impact \n" @@ -242,7 +243,7 @@ static PyMethodDef module_methods[] = { " RA2=None, DEC2=None, CZ2=None, weights2=None,\n" " is_comoving_dist=False,\n" " verbose=False, output_savg=False,\n" - " fast_divide=False, xbin_refine_factor=2, \n" + " fast_divide_and_NR_steps=0, xbin_refine_factor=2, \n" " ybin_refine_factor=2, zbin_refine_factor=1, \n" " max_cells_per_dim=100, \n" " c_api_timer=False, isa=-1)\n" @@ -338,10 +339,11 @@ static PyMethodDef module_methods[] = { " precision and can not be trusted. If you need accurate ``savg``\n" " values, then pass in double precision arrays for the particle positions.\n" "\n" - "fast_divide: boolean (default false)\n" - " Boolean flag to replace the division in ``AVX`` implementation with an\n" - " approximate reciprocal, followed by a Newton-Raphson step. Improves\n" - " runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.\n" + "fast_divide_and_NR_steps: integer (default 0)\n" + " Replaces the division in ``AVX`` implementation with an\n" + " approximate reciprocal, followed by ``fast_divide_and_NR_steps`` " + " Newton-Raphson step. Can improve \n" + " runtime by ~15-20%. Value of 0 keeps the standard division.\n" "\n" "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n" " Controls the refinement on the cell sizes. 
Can have up to a 20% impact \n" @@ -1062,7 +1064,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg options.verbose = 0; options.instruction_set = -1; options.periodic = 0; - options.fast_divide=0; + options.fast_divide_and_NR_steps=0; options.c_api_timer = 0; int8_t xbin_ref=options.bin_refine_factors[0], ybin_ref=options.bin_refine_factors[1], @@ -1091,7 +1093,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg "is_comoving_dist", "verbose", /* keyword verbose -> print extra info at runtime + progressbar */ "output_rpavg", - "fast_divide", + "fast_divide_and_NR_steps", "xbin_refine_factor", "ybin_refine_factor", "zbin_refine_factor", @@ -1115,7 +1117,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg &(options.is_comoving_dist), &(options.verbose), &(options.need_avg_sep), - &(options.fast_divide), + &(options.fast_divide_and_NR_steps), &xbin_ref, &ybin_ref, &zbin_ref, &(options.max_cells_per_dim), &(options.c_api_timer), @@ -1388,7 +1390,7 @@ static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args options.verbose = 0; options.instruction_set = -1; options.periodic = 0; - options.fast_divide=0; + options.fast_divide_and_NR_steps=0; options.c_api_timer = 0; int8_t xbin_ref=options.bin_refine_factors[0], ybin_ref=options.bin_refine_factors[1], @@ -1419,7 +1421,7 @@ static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args "is_comoving_dist", "verbose", /* keyword verbose -> print extra info at runtime + progressbar */ "output_savg", - "fast_divide", + "fast_divide_and_NR_steps", "xbin_refine_factor", "ybin_refine_factor", "zbin_refine_factor", @@ -1443,7 +1445,7 @@ static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args &(options.is_comoving_dist), &(options.verbose), &(options.need_avg_sep), - &(options.fast_divide), + &(options.fast_divide_and_NR_steps), &xbin_ref, &ybin_ref, &zbin_ref, &(options.max_cells_per_dim), &(options.c_api_timer), diff --git a/mocks/tests/tests_mocks.c b/mocks/tests/tests_mocks.c index 2f5a15cd..61f94fd2 100644 --- a/mocks/tests/tests_mocks.c +++ b/mocks/tests/tests_mocks.c @@ -537,9 +537,8 @@ int main(int argc, char **argv) options.verbose=0; options.periodic=0; options.float_type=sizeof(double); - options.fast_divide=0; + options.fast_divide_and_NR_steps=0; options.fast_acos=0; - //options.instruction_set = FALLBACK; int status = init_cosmology(cosmology_flag); if(status != EXIT_SUCCESS) { diff --git a/rules.mk b/rules.mk index a7756045..80900870 100644 --- a/rules.mk +++ b/rules.mk @@ -56,7 +56,7 @@ $(TARGET).o: $(TARGET).c $(ROOT_DIR)/common.mk Makefile $(ROOT_DIR)/theory.optio %_float.o: %_float.c $(CC) -DNDOUBLE_PREC $(CFLAGS) $(INCLUDE) $(EXTRA_INCL) -c $< -o $@ -%.o: %.c $(ROOT_DIR)/common.mk Makefile +%.o: %.c $(ROOT_DIR)/common.mk $(ROOT_DIR)/utils/defs.h Makefile $(CC) $(CFLAGS) $(INCLUDE) $(EXTRA_INCL) -c $< -o $@ $(LIBRARY): $(LIBOBJS) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile diff --git a/theory.options b/theory.options index 1bc87bf5..686555b5 100644 --- a/theory.options +++ b/theory.options @@ -1,9 +1,10 @@ #### Science use-cases for Theory Correlation Functions OPT = -DPERIODIC #OPT += -DOUTPUT_RPAVG ### Enabling this can cause up to a 2x performance hit +#OPT += -DFAST_DIVIDE=2 ##replaces a divide (in DDsmu) with approximate reciprocals, followed by 'FAST_DIVIDE' number of Newton-Raphson steps. 
Trade-off between speed and accuracy; may be slower on newer computers #### Code specs for both theory and data Correlation Functions -OPT += -DDOUBLE_PREC +#OPT += -DDOUBLE_PREC diff --git a/theory/DDsmu/Makefile b/theory/DDsmu/Makefile index ca63851a..4c498b5a 100644 --- a/theory/DDsmu/Makefile +++ b/theory/DDsmu/Makefile @@ -25,15 +25,16 @@ INCL := countpairs_s_mu_kernels_float.c countpairs_s_mu_kernels_double.c count $(UTILS_DIR)/defs.h $(UTILS_DIR)/cpu_features.h \ $(IO_DIR)/ftread.h $(IO_DIR)/io.h $(UTILS_DIR)/utils.h $(UTILS_DIR)/progressbar.h \ $(UTILS_DIR)/weight_functions_double.h $(UTILS_DIR)/weight_functions_float.h $(UTILS_DIR)/weight_functions.h.src \ - $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src + $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src TARGETOBJS := $(TARGETSRC:.c=.o) LIBOBJS := $(LIBSRC:.c=.o) all: $(TARGETS) $(TARGETSRC) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile -countpairs_s_mu_impl_double.o:countpairs_s_mu_impl_double.c countpairs_s_mu_impl_double.h countpairs_s_mu_kernels_double.c $(UTILS_DIR)/gridlink_impl_double.h $(UTILS_DIR)/cellarray_double.h -countpairs_s_mu_impl_float.o:countpairs_s_mu_impl_float.c countpairs_s_mu_impl_float.h countpairs_s_mu_kernels_float.c $(UTILS_DIR)/gridlink_impl_float.h $(UTILS_DIR)/cellarray_float.h -countpairs_s_mu.o:countpairs_s_mu.c countpairs_s_mu_impl_double.h countpairs_s_mu_impl_float.h $(INCL) +countpairs_s_mu_impl_double.o:countpairs_s_mu_impl_double.c countpairs_s_mu_impl_double.h countpairs_s_mu_kernels_double.c $(UTILS_DIR)/gridlink_impl_double.h +countpairs_s_mu_impl_float.o:countpairs_s_mu_impl_float.c countpairs_s_mu_impl_float.h countpairs_s_mu_kernels_float.c $(UTILS_DIR)/gridlink_impl_float.h +countpairs_s_mu.o:countpairs_s_mu.c countpairs_s_mu_impl_double.h countpairs_s_mu_impl_float.h countpairs_s_mu.h $(INCL) +countpairs_s_mu_impl_float.c countpairs_s_mu_impl_double.c:countpairs_s_mu_impl.c.src $(INCL) libs: lib lib: $(LIBRARY) diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index 494a6aff..fbbd6d05 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -174,6 +174,13 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 options->max_cells_per_dim = NLATMAX; } + if(options->fast_divide_and_NR_steps >= MAX_FAST_DIVIDE_NR_STEPS) { + fprintf(stderr, ANSI_COLOR_MAGENTA"Warning: The number of requested Newton-Raphson steps = %u is larger than max. allowed steps = %u." + " Switching to a standard divide"ANSI_COLOR_RESET"\n", + options->fast_divide_and_NR_steps, MAX_FAST_DIVIDE_NR_STEPS); + options->fast_divide_and_NR_steps = 0; + } + /* setup interrupt handler -> mostly useful during the python execution. 
Let's Ctrl-C abort the extension */ SETUP_INTERRUPT_HANDLERS(interrupt_handler_countpairs_s_mu_DOUBLE); @@ -449,10 +456,11 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 } const int status = countpairs_s_mu_function_DOUBLE(N1, x1, y1, z1, weights1, N1, x1, y1, z1, weights1, - same_cell - ,sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax - ,ZERO, ZERO, ZERO - ,this_savg, npairs, + same_cell, + options->fast_divide_and_NR_steps, + sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax, + ZERO, ZERO, ZERO, + this_savg, npairs, this_weightavg, extra->weight_method); /* This actually causes a race condition under OpenMP - but mostly I care that an error occurred - rather than the exact value of @@ -485,7 +493,9 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 this_weightavg = weightavg; } const int status = countpairs_s_mu_function_DOUBLE(N1, x1, y1, z1, weights1, - N2, x2, y2, z2, weights2, same_cell, + N2, x2, y2, z2, weights2, + same_cell, + options->fast_divide_and_NR_steps, sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax, off_xwrap, off_ywrap, off_zwrap, this_savg, npairs, diff --git a/theory/DDsmu/countpairs_s_mu_impl.h.src b/theory/DDsmu/countpairs_s_mu_impl.h.src index 7194b827..e9f57cd3 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.h.src +++ b/theory/DDsmu/countpairs_s_mu_impl.h.src @@ -22,7 +22,9 @@ extern "C" { extern void interrupt_handler_countpairs_s_mu_DOUBLE(int signo); typedef int (*countpairs_s_mu_func_ptr_DOUBLE)(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, - const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, + const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, diff --git a/theory/DDsmu/countpairs_s_mu_kernels.c.src b/theory/DDsmu/countpairs_s_mu_kernels.c.src index 72bf8044..00e026f7 100644 --- a/theory/DDsmu/countpairs_s_mu_kernels.c.src +++ b/theory/DDsmu/countpairs_s_mu_kernels.c.src @@ -23,7 +23,9 @@ #include "avx_calls.h" static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, - const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, + const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, @@ -211,9 +213,20 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE //There is some s2 that satisfies sqr_smin <= s2 < sqr_smax && mu_min <= |dz| < mu_max s2 = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_smax, s2, m_mask_left); - /*m_mu := sqrt(s2/dz^2) (with masked elements set to mu_max */ - const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, AVX_DIVIDE_FLOATS(m_sqr_zdiff, s2), m_mask_left)); - + /*m_sqr_mu := dz^2/s^2 (with masked 
elements set to mu_max */ + AVX_FLOATS m_sqr_mu = AVX_SETZERO_FLOAT(); + + + /* Check if fast_divide is enabled and either use the normal divide or + use the approx. reciprocal followed by `fast_divide_and_NR_steps` number + of Newton-Raphson steps to improve numerical accuracy. + + macro is defined in `avx_calls.h` + */ + CHECK_AND_FAST_DIVIDE(m_sqr_mu, m_sqr_zdiff, s2, fast_divide_and_NR_steps); + + const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, m_sqr_mu, m_mask_left)); + if(need_savg) { union_mDperp.m_Dperp = AVX_SQRT_FLOAT(s2); } @@ -331,13 +344,17 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE #include "sse_calls.h" static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, - const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, + const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, DOUBLE *src_savg, uint64_t *src_npairs, DOUBLE *src_weightavg, const weight_method_t weight_method) { + (void) fast_divide_and_NR_steps; + if(N0 == 0 || N1 == 0) { return EXIT_SUCCESS; } @@ -633,6 +650,7 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, @@ -640,6 +658,7 @@ static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *src_weightavg, const weight_method_t weight_method) { + (void) fast_divide_and_NR_steps; if(N0 == 0 || N1 == 0) { return EXIT_SUCCESS; } diff --git a/theory/python_bindings/_countpairs.c b/theory/python_bindings/_countpairs.c index 38debd15..22de4a66 100644 --- a/theory/python_bindings/_countpairs.c +++ b/theory/python_bindings/_countpairs.c @@ -607,8 +607,9 @@ static PyMethodDef module_methods[] = { {"countpairs_s_mu" ,(PyCFunction) countpairs_countpairs_s_mu ,METH_VARARGS | METH_KEYWORDS, "countpairs_s_mu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=None, weight_type=None,\n" " periodic=True, X2=None, Y2=None, Z2=None, weights2=None, verbose=False,\n" - " boxsize=0.0, output_savg=False, xbin_refine_factor=2, ybin_refine_factor=2,\n" - " zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa=-1)\n" + " boxsize=0.0, output_savg=False, fast_divide_and_NR_steps=0,\n" + " xbin_refine_factor=2, ybin_refine_factor=2, zbin_refine_factor=1,\n" + " max_cells_per_dim=100, c_api_timer=False, isa=-1)\n" "\n" "Calculate the 2-D pair-counts corresponding to the real-space correlation\n" "function, "XI_CHAR"(s, "MU_CHAR"). 
Pairs which are separated\n" @@ -691,6 +692,13 @@ static PyMethodDef module_methods[] = { " values, then pass in double precision arrays for the particle positions.\n" "\n" + "fast_divide_and_NR_steps: integer (default 0)\n" + " Replaces the division in the ``AVX`` implementation with an\n" + " approximate reciprocal, followed by ``fast_divide_and_NR_steps``\n" + " Newton-Raphson steps. Can improve\n" + " runtime by ~15-20%. A value of 0 keeps the standard division.\n" + "\n" + "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n" " Controls the refinement on the cell sizes. Can have up to a 20% impact \n" " on runtime. \n\n" @@ -2167,6 +2175,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb options.instruction_set = -1; options.periodic = 1; options.c_api_timer = 0; + options.fast_divide_and_NR_steps = 0; int8_t xbin_ref=options.bin_refine_factors[0], ybin_ref=options.bin_refine_factors[1], zbin_ref=options.bin_refine_factors[2]; @@ -2189,6 +2198,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb "verbose", /* keyword verbose -> print extra info at runtime + progressbar */ "boxsize", "output_savg", + "fast_divide_and_NR_steps", "xbin_refine_factor", "ybin_refine_factor", "zbin_refine_factor", @@ -2199,7 +2209,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb NULL }; - if ( ! PyArg_ParseTupleAndKeywords(args, kwargs, "iisdiO!O!O!|O!O!O!O!O!bbdbbbbhbis", kwlist, + if ( ! PyArg_ParseTupleAndKeywords(args, kwargs, "iisdiO!O!O!|O!O!O!O!O!bbdbbbbbhbis", kwlist, &autocorr,&nthreads,&binfile, &mu_max, &nmu_bins, &PyArray_Type,&x1_obj, &PyArray_Type,&y1_obj, @@ -2213,6 +2223,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb &(options.verbose), &(options.boxsize), &(options.need_avg_sep), + &(options.fast_divide_and_NR_steps), &xbin_ref, &ybin_ref, &zbin_ref, &(options.max_cells_per_dim), &(options.c_api_timer), @@ -2224,7 +2235,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb fprintf(stdout, "\n"); char msg[1024]; - int len=snprintf(msg, 1024,"ArgumentError: In DDsmu> Could not parse the arguments. Input parameters are: \n"); + int len=snprintf(msg, 1024,"ArgumentError: In %s> Could not parse the arguments. Input parameters are: \n", __FUNCTION__); /* How many keywords do we have?
Subtract 1 because of the last NULL */ const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1; diff --git a/theory/tests/Makefile b/theory/tests/Makefile index 41e7d2d1..5700748d 100644 --- a/theory/tests/Makefile +++ b/theory/tests/Makefile @@ -23,10 +23,11 @@ VPF_LIB := countspheres include $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk TARGETS := test_periodic test_nonperiodic + ifneq ($(COMPILE_PYTHON_EXT), 0) -TARGETS += python_lib + TARGETS += python_lib else -$(warning $(ccmagenta) Skipping python tests since python or numpy is unavailable $(ccreset)) + $(warning $(ccmagenta) Skipping python tests since python or numpy is unavailable $(ccreset)) endif SRC1 := test_periodic.c $(IO_DIR)/io.c $(IO_DIR)/ftread.c $(UTILS_DIR)/utils.c diff --git a/theory/tests/test_nonperiodic.c b/theory/tests/test_nonperiodic.c index 1a6300b4..33fc47e0 100644 --- a/theory/tests/test_nonperiodic.c +++ b/theory/tests/test_nonperiodic.c @@ -335,8 +335,9 @@ int main(int argc, char **argv) options.need_avg_sep=1; options.verbose=0; options.periodic=0; + options.fast_divide_and_NR_steps=0; options.float_type=sizeof(double); - + gettimeofday(&tstart,NULL); //set the globals diff --git a/theory/tests/test_periodic.c b/theory/tests/test_periodic.c index 9a0e6615..0ec8b77b 100644 --- a/theory/tests/test_periodic.c +++ b/theory/tests/test_periodic.c @@ -576,8 +576,8 @@ int main(int argc, char **argv) options.need_avg_sep=1; options.verbose=0; options.periodic=1; + options.fast_divide_and_NR_steps=0; options.float_type=sizeof(double); - //options.instruction_set = FALLBACK; char file[]="../tests/data/gals_Mr19.ff"; char fileformat[]="f"; diff --git a/utils/Makefile b/utils/Makefile index 5d9766fb..599acce9 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -11,8 +11,8 @@ INCL := avx_calls.h sse_calls.h defs.h defs.h function_precision.h cosmology_pa gridlink_impl_double.h gridlink_impl_float.h gridlink_impl.c.src gridlink_impl.h.src \ gridlink_mocks_impl_float.h gridlink_mocks_impl_double.h gridlink_mocks_impl.h.src gridlink_mocks_impl.c.src \ progressbar.h set_cosmo_dist.h set_cosmology.h sglib.h utils.h \ - weight_functions_double.h weight_functions_float.h weight_functions.h.src \ - weight_defs_double.h weight_defs_float.h weight_defs.h.src + weight_functions_double.h weight_functions_float.h weight_functions.h.src \ + weight_defs_double.h weight_defs_float.h weight_defs.h.src all: $(TARGETOBJS) Makefile $(ROOT_DIR)/common.mk $(ROOT_DIR)/theory.options $(ROOT_DIR)/mocks.options diff --git a/utils/avx_calls.h b/utils/avx_calls.h index 52a545aa..37db6a8b 100644 --- a/utils/avx_calls.h +++ b/utils/avx_calls.h @@ -39,7 +39,9 @@ extern "C" { #define AVX_NVEC 8 #define AVX_INTS __m256i #define AVX_FLOATS __m256 - + +#define AVX_SETZERO_FLOAT() _mm256_setzero_ps() + #define AVX_LOAD_FLOATS_UNALIGNED(X) _mm256_loadu_ps(X) #define AVX_LOAD_FLOATS_ALIGNED(X) _mm256_load_ps(X) #define AVX_MULTIPLY_FLOATS(X,Y) _mm256_mul_ps(X,Y) @@ -103,6 +105,8 @@ extern "C" { #define AVX_INTS __m128i #define AVX_FLOATS __m256d +#define AVX_SETZERO_FLOAT() _mm256_setzero_pd() + #define AVX_LOAD_FLOATS_UNALIGNED(X) _mm256_loadu_pd(X) #define AVX_LOAD_FLOATS_ALIGNED(X) _mm256_load_pd(X) #define AVX_MULTIPLY_FLOATS(X,Y) _mm256_mul_pd(X,Y) @@ -198,6 +202,65 @@ static inline AVX_FLOATS inv_cosine_avx(const AVX_FLOATS X, const int order) #endif + +#ifdef DOUBLE_PREC +#define CHECK_AND_FAST_DIVIDE(result, numerator, denominator, fast_divide_and_NR_steps) { \ + /* For double precision floats */ \ + if (fast_divide_and_NR_steps == 0) { \ + 
result = AVX_DIVIDE_FLOATS(numerator, denominator); \
+            /* The divide is the actual operation we need */ \
+            /* but divides are about 10x slower than multiplies. So, I am replacing it */ \
+            /* with an approximate reciprocal in floating point */ \
+            /* + `fast_divide_and_NR_steps` iterations of Newton-Raphson in case of DOUBLE */ \
+        } else { \
+            unsigned int _ii; \
+            /* the following block does an approximate reciprocal followed by `fast_divide_and_NR_steps` iterations of Newton-Raphson */ \
+            const __m128 float_tmp1 = _mm256_cvtpd_ps(denominator);/* convert double to float -> not avx_floats := _m256d */ \
+            /*(convert 4 doubles into 4 floats -> use half of available 256 bit SIMD registers) */ \
+            __m128 float_inv_tmp1 = _mm_rcp_ps(float_tmp1);/* intrinsic for 128 bit float approximate reciprocal */ \
+            const AVX_FLOATS rc = _mm256_cvtps_pd(float_inv_tmp1);/* convert back to double */ \
+            /* We have the double->float->approx. reciprocal->double process done. */ \
+            /* Now improve the accuracy of the divide with Newton-Raphson. */ \
+            const AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0); \
+            AVX_FLOATS rc_iter = rc; \
+            /* Do NewtonRaphson iterations: rc_iter <- rc_iter*(2 - denominator*rc_iter) */ \
+            for(_ii=0;_ii<fast_divide_and_NR_steps;_ii++) { \
+                rc_iter = AVX_MULTIPLY_FLOATS(rc_iter, AVX_SUBTRACT_FLOATS(two, AVX_MULTIPLY_FLOATS(denominator, rc_iter))); \
+            } \
+            result = AVX_MULTIPLY_FLOATS(numerator, rc_iter); \
+        } /* end of fast-divide for DOUBLE */ \
+    }
+#else
+#define CHECK_AND_FAST_DIVIDE(result, numerator, denominator, fast_divide_and_NR_steps) { \
+        /* For single precision floats */ \
+        if (fast_divide_and_NR_steps == 0) { \
+            result = AVX_DIVIDE_FLOATS(numerator, denominator); \
+        } else { \
+            unsigned int _ii; \
+            /* approximate reciprocal followed by `fast_divide_and_NR_steps` iterations of Newton-Raphson */ \
+            const AVX_FLOATS rc = AVX_RECIPROCAL_FLOATS(denominator);/* intrinsic for the approximate reciprocal */ \
+            /* We have the approx. reciprocal done. */ \
+            /* Now improve the accuracy of the divide with Newton-Raphson. */ \
+            const AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0); \
+            AVX_FLOATS rc_iter = rc; \
+            /* Do NewtonRaphson iterations */ \
+            for(_ii=0;_ii<fast_divide_and_NR_steps;_ii++) { \
+                rc_iter = AVX_MULTIPLY_FLOATS(rc_iter, AVX_SUBTRACT_FLOATS(two, AVX_MULTIPLY_FLOATS(denominator, rc_iter))); \
+            } \
+            result = AVX_MULTIPLY_FLOATS(numerator, rc_iter); \
+        } /* end of fast-divide for FLOAT */ \
+    }
+#endif
diff --git a/utils/defs.h b/utils/defs.h
@@ ... @@
-    uint8_t fast_divide;
+    /* Fast divide for the AVX kernels of DD(rp,pi) and DD(s,mu) */
+    uint8_t fast_divide_and_NR_steps; /* If set to 0, the standard divide is used;
+                                         if fast_divide_and_NR_steps > 0, the value is interpreted as the number of NR steps
+                                         i.e., fast_divide_and_NR_steps = 2, performs two steps of Newton-Raphson
+                                         Anything greater than ~5, probably makes the code slower than the
+                                         divide without any improvement in precision
+                                       */
+
     /* Fast arccos for wtheta (effective only when OUTPUT_THETAAVG is enabled) */
     uint8_t fast_acos;
@@ -269,13 +277,11 @@ static inline struct config_options get_config_options(void)
 #endif
     /* Options specific to mocks */
-    /* Options for DDrppi_mocks */
-#ifdef FAST_DIVIDE
-    options.fast_divide=1;
+#if defined(FAST_DIVIDE)
+    options.fast_divide_and_NR_steps=FAST_DIVIDE;
 #endif
-    /* Options for wtheta*/
 #ifdef OUTPUT_THETAAVG
     options.need_avg_sep = 1;
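A note on the new option, for reviewers: the sketch below is a minimal scalar analogue (not part of the patch) of what CHECK_AND_FAST_DIVIDE does on each SIMD lane. The helper name and the 12-bit truncation standing in for the `_mm_rcp_ps` seed are made up for illustration. Each Newton-Raphson step computes r <- r*(2 - d*r) and roughly doubles the number of correct bits, which is why a handful of steps recovers full precision and why values beyond ~5 only add cost.

#include <stdio.h>

/* Scalar illustration of the fast-divide path: start from a crude reciprocal
 * estimate of the denominator and refine it with Newton-Raphson. */
static double fast_divide_scalar(double num, double den, unsigned int nr_steps)
{
    if (nr_steps == 0) {
        return num / den;                    /* standard divide, as in the macro */
    }
    /* crude ~12-bit reciprocal seed; a stand-in for _mm_rcp_ps (demo only,
       assumes 0 < 1/den and that (1/den)*4096 fits in an int) */
    const float r = 1.0f / (float) den;
    double rc = (double) ((int) (r * 4096.0f)) / 4096.0;
    for (unsigned int i = 0; i < nr_steps; i++) {
        rc = rc * (2.0 - den * rc);          /* Newton-Raphson update */
    }
    return num * rc;                         /* numerator * (refined 1/denominator) */
}

int main(void)
{
    const double num = 3.0, den = 7.0;
    for (unsigned int steps = 0; steps <= 3; steps++) {
        printf("NR steps = %u : %.17g (exact %.17g)\n",
               steps, fast_divide_scalar(num, den, steps), num / den);
    }
    return 0;
}

Because the macro falls back to the exact divide when the option is 0, default results are unchanged; the option only trades a little precision for speed when explicitly requested.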
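For readers new to DD(s, mu), here is a scalar sketch of the quantity the kernels above bin, assuming the z-axis as the line of sight (as in the theory pair counter). The helper name `pair_s_mu_bin` and the linear s-binning are illustrative only; the real s-bin edges come from the user-supplied binfile.

#include <math.h>

/* Illustrative only: for one pair separated by (dx, dy, dz), compute
 *   s  = |r1 - r2|   (3-D pair separation)
 *   mu = |dz| / s    (cosine of the angle between the pair and the z-axis)
 * The AVX kernel computes mu^2 = dz^2/s^2 via CHECK_AND_FAST_DIVIDE and then
 * takes a square root; here plain arithmetic is used instead. */
static int pair_s_mu_bin(const double dx, const double dy, const double dz,
                         const double smin, const double smax, const int nsbin,
                         const double mu_max, const int nmu_bins,
                         int *is, int *imu)
{
    const double sqr_s = dx*dx + dy*dy + dz*dz;
    if (sqr_s <= 0.0 || sqr_s < smin*smin || sqr_s >= smax*smax) {
        return 0;                             /* pair does not contribute */
    }
    const double s  = sqrt(sqr_s);
    const double mu = fabs(dz) / s;           /* equivalent to sqrt(dz^2/sqr_s) */
    if (mu >= mu_max) {
        return 0;
    }
    *is  = (int) ((s - smin) * nsbin / (smax - smin));  /* linear s bins, demo only */
    *imu = (int) (mu * nmu_bins / mu_max);              /* nmu_bins bins in [0, mu_max) */
    return 1;
}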