From 24c73b2e7473888782e01b0e7008ff4379b0fc45 Mon Sep 17 00:00:00 2001 From: Manodeep Sinha Date: Sat, 18 Aug 2018 06:23:10 +1000 Subject: [PATCH] ddsmu (#166) * DD(s,mu) function for mocks/theory (#130) * Updated README.rst [ci skip] Weights are on `pip`. Changed repo files to be links. * add a DDsmu mocks function * remove extra slash * add tests of DDsmu_mocks * add name to authors * update to add a mu_max function parameter * bug fixes; verified output against kdcount for different mu_max for AVX, SSE42, and fallback * adding theory DDsmu; verified for different mu_max and all ISA against kdcount * update docs * include DDsmu in theory/tests * fix type error * forgot to remove other variable definition * Updating the docs for (theory) DDsmu * Reviewed the theory functions (still need comprehensive tests and update RTD docs) * Adding the new file, tests_common.h, to allow integration tests (exhaustive tests for new pair-counters). * My (broken) mocks code * Fixing bugs uncovered by doctests (which are still not failing the build) * Trying to solve the doctests failures and the warnings raised during compiling the docs for DDsmu * I have a suspicion that doctests are not failing the build because they are in the 'after_success' part. Moved the doctests into the tests section. Might solve #143 * Attempting to fix #144 * Fixed the Makefile for DDsmu tests * Added the tests for DDsmu_mocks into the Makefile * Whitespace changes only for better readability [ci skip] * Corrected the variable type for nmu_bins and some small changes for better code readability * The output file for DDsmu_mocks.DD really corresponds to DDsmu_mocks.RR (see #132) * Fixed the DDsmu_mocks tests * Changed the name of the DDsmu_mocks test from DD->RR. Put the name of each test on a new line * Attempting to fix travis failure (from doctest failure) * Another attempt at fixing the doctest failure on travis * Next attempt at fixing doctest failure * Small change to the auto-generated docs [ci skip] * Doctests are failing because numpy does not honour set_printoptions for structured arrays (numpy issue #5606). This numpy issue seems to have been solved in 1.12. Bumping the default travis numpy version to 1.12 * Still trying to fix doctest failures. Now removed testing for python3.3 and added python3.6 * Missed the 'then' in the if condition. Added a xcode9 image for osx tests * Added a python3.6 for osx and changed the python version to python2.7 for xcode6 and xcode7 * Corrected the miniconda installer filenames for python2 * Added the numpy version=1.7 for testing the minimum requirements on osx * Added C mode declaration for syntax highlighting [ci skip] * Made sure that mu_max is specified before nmu_bins. Changed the ordering in the python extension as well * Added example C codes for the DDsmu and DDsmu_mocks pair-counters * The case of a mis-placed dot (or how to break the build) * Enforce that mu_max is scalar and greater than 0 * pimax is not required for DDsmu_mocks. Correctly added the parx/pary/parz components into the pair-weight struct for DDsmu_mocks and DDrppi_mocks. Renamed variables to make context clearer (will need to be done for DDrppi_mocks as well) * Renamed sqr_sep to sqr_s and removed checks for pimax * Changed the kernel parameters to smax/smin from sqr_smax/sqr_smin * The AVX tests pass now for DDsmu_mocks * Fixed the INTEGRATION_TEST section for DDtheta_mocks * Updated docstrings in python bindings for DDsmu and DDsmu_mocks * Added docs for DDsmu and DDsmu_mocks. 
Fixed the docstring formatting (removed notes within function docstrings) * Added the missing variable for doctests * Renamed w(theta) to DD(theta) and changed some text formatting * I forgot to fix the DDsmu_mocks file for the doctest failure * DDsmu PR is now ready to be merged. Bumping version to 2.1 * README updated to show that github pages are no longer being published [ci skip] * Filled in some more missing docs/docstrings * Remove further references to github pages site [ci skip] * Adding in the fast_divide option to theory/DDsmu paircounter. Not tested * Fixing the typos in fast-divide part of DDsmu. Added in other changes as well -- oops * Added in the fast_divide option into the main python wrappers. Fixed build failure * Added entries for the upcoming versions and features [ci skip] * Hopefully fixing build failure * Attempting to fix warning during building docs * Add PR # to changelog --- .travis.yml | 7 - CHANGES.rst | 11 +- Corrfunc/mocks/DDrppi_mocks.py | 19 +- Corrfunc/mocks/DDsmu_mocks.py | 28 +- Corrfunc/theory/DDsmu.py | 18 +- common.mk | 241 +++++++++--------- docs/source/conf.py | 3 +- mocks.options | 4 +- .../countpairs_rp_pi_mocks_impl.c.src | 10 +- .../countpairs_rp_pi_mocks_impl.h.src | 2 +- .../countpairs_rp_pi_mocks_kernels.c.src | 84 +++--- .../countpairs_s_mu_mocks_impl.c.src | 10 +- mocks/python_bindings/_countpairs_mocks.c | 34 +-- mocks/tests/tests_mocks.c | 3 +- rules.mk | 2 +- theory.options | 3 +- theory/DDsmu/Makefile | 9 +- theory/DDsmu/countpairs_s_mu_impl.c.src | 20 +- theory/DDsmu/countpairs_s_mu_impl.h.src | 4 +- theory/DDsmu/countpairs_s_mu_kernels.c.src | 29 ++- theory/python_bindings/_countpairs.c | 19 +- theory/tests/Makefile | 5 +- theory/tests/test_nonperiodic.c | 3 +- theory/tests/test_periodic.c | 2 +- utils/Makefile | 4 +- utils/avx_calls.h | 65 ++++- utils/defs.h | 20 +- 27 files changed, 396 insertions(+), 263 deletions(-) diff --git a/.travis.yml b/.travis.yml index ee991793..4b5f61ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,7 +43,6 @@ matrix: # - brew outdated xctool || brew upgrade xctool # - brew tap homebrew/versions && brew install clang-omp # - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh - - os: osx osx_image: xcode9 compiler: clang @@ -66,12 +65,6 @@ matrix: before_install: - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh - # - os: osx - # osx_image: xcode6.4 - # compiler: clang - # env: COMPILER=clang FAMILY=clang V='Apple LLVM 7.0.0' PYTHON_VERSION=2.6 NUMPY_VERSION=1.7 DOCTEST=FALSE - # before_install: - # - wget http://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh # - os: osx # compiler: gcc diff --git a/CHANGES.rst b/CHANGES.rst index cf9f96d4..7e08ec6c 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,16 +7,25 @@ New features ------------ -- New pair counter `DD(s, mu)` for theory and mocks - conda installable package +- GPU version 2.1.0 ======= +New features +------------ +- New pair counter `DD(s, mu)` for theory and mocks (contributed by @nickhand, + in #130 and #132) [#166] + + Enhancements ------------ - GSL version now specified and tested by Travis [#164] +- Now possible to specify the number of Newton-Raphson steps to +improve accuracy of approximate reciprocals. 
Available in `DD(rp, pi)` for mocks, +and `DD(s, mu)` for both theory and mocks 2.0.0 diff --git a/Corrfunc/mocks/DDrppi_mocks.py b/Corrfunc/mocks/DDrppi_mocks.py index e98a8be6..dd52fa6e 100644 --- a/Corrfunc/mocks/DDrppi_mocks.py +++ b/Corrfunc/mocks/DDrppi_mocks.py @@ -19,9 +19,9 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile, RA2=None, DEC2=None, CZ2=None, weights2=None, is_comoving_dist=False, verbose=False, output_rpavg=False, - fast_divide=False, xbin_refine_factor=2, - ybin_refine_factor=2, zbin_refine_factor=1, - max_cells_per_dim=100, + fast_divide_and_NR_steps=0, + xbin_refine_factor=2, ybin_refine_factor=2, + zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa=r'fastest', weight_type=None): """ Calculate the 2-D pair-counts corresponding to the projected correlation @@ -169,12 +169,13 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile, suffer from numerical loss of precision and can not be trusted. If you need accurate ``rpavg`` values, then pass in double precision arrays for the particle positions. - - fast_divide : boolean (default false) - Boolean flag to replace the division in ``AVX`` implementation with an - approximate reciprocal, followed by two Newton-Raphson steps. Improves - runtime by ~15-20%. Loss of precision is at the 5-6th decimal place. + fast_divide_and_NR_steps: integer (default 0) + Replaces the division in ``AVX`` implementation with an approximate + reciprocal, followed by ``fast_divide_and_NR_steps`` of Newton-Raphson. + Can improve runtime by ~15-20% on older computers. Value of 0 uses + the standard division operation. + (xyz)bin_refine_factor : integer, default is (2,2,1); typically within [1-3] Controls the refinement on the cell sizes. Can have up to a 20% impact on runtime. @@ -366,7 +367,7 @@ def DDrppi_mocks(autocorr, cosmology, nthreads, pimax, binfile, is_comoving_dist=is_comoving_dist, verbose=verbose, output_rpavg=output_rpavg, - fast_divide=fast_divide, + fast_divide_and_NR_steps=fast_divide_and_NR_steps, xbin_refine_factor=xbin_refine_factor, ybin_refine_factor=ybin_refine_factor, zbin_refine_factor=zbin_refine_factor, diff --git a/Corrfunc/mocks/DDsmu_mocks.py b/Corrfunc/mocks/DDsmu_mocks.py index f4da2616..16aa2def 100755 --- a/Corrfunc/mocks/DDsmu_mocks.py +++ b/Corrfunc/mocks/DDsmu_mocks.py @@ -18,9 +18,9 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, RA2=None, DEC2=None, CZ2=None, weights2=None, is_comoving_dist=False, verbose=False, output_savg=False, - fast_divide=False, xbin_refine_factor=2, - ybin_refine_factor=2, zbin_refine_factor=1, - max_cells_per_dim=100, + fast_divide_and_NR_steps=0, + xbin_refine_factor=2, ybin_refine_factor=2, + zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa='fastest', weight_type=None): """ Calculate the 2-D pair-counts corresponding to the projected correlation @@ -121,10 +121,11 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, co-moving distance, rather than `cz`. weights1: array_like, real (float/double), optional - A scalar, or an array of weights of shape (n_weights, n_positions) or (n_positions,). - `weight_type` specifies how these weights are used; results are returned - in the `weightavg` field. If only one of weights1 and weights2 is - specified, the other will be set to uniform weights. + A scalar, or an array of weights of shape (n_weights, n_positions) + or (n_positions,). 
`weight_type` specifies how these weights are used; + results are returned in the `weightavg` field. If only one of + ``weights1`` or ``weights2`` is specified, the other will be set + to uniform weights. RA2: array-like, real (float/double) The array of Right Ascensions for the second set of points. RA's @@ -171,11 +172,12 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, values, then pass in double precision arrays for the particle positions. - fast_divide: boolean (default false) - Boolean flag to replace the division in ``AVX`` implementation with an - approximate reciprocal, followed by a Newton-Raphson step. Improves - runtime by ~15-20%. Loss of precision is at the 5-6th decimal place. - + fast_divide_and_NR_steps: integer (default 0) + Replaces the division in ``AVX`` implementation with an approximate + reciprocal, followed by ``fast_divide_and_NR_steps`` of Newton-Raphson. + Can improve runtime by ~15-20% on older computers. Value of 0 uses + the standard division operation. + (xyz)bin_refine_factor: integer, default is (2,2,1); typically within [1-3] Controls the refinement on the cell sizes. Can have up to a 20% impact on runtime. @@ -290,7 +292,7 @@ def DDsmu_mocks(autocorr, cosmology, nthreads, mu_max, nmu_bins, binfile, is_comoving_dist=is_comoving_dist, verbose=verbose, output_savg=output_savg, - fast_divide=fast_divide, + fast_divide_and_NR_steps=fast_divide_and_NR_steps, xbin_refine_factor=xbin_refine_factor, ybin_refine_factor=ybin_refine_factor, zbin_refine_factor=zbin_refine_factor, diff --git a/Corrfunc/theory/DDsmu.py b/Corrfunc/theory/DDsmu.py index b8478f80..17659739 100644 --- a/Corrfunc/theory/DDsmu.py +++ b/Corrfunc/theory/DDsmu.py @@ -14,11 +14,12 @@ def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=None, - periodic=True, X2=None, Y2=None, Z2=None, weights2=None, - verbose=False, boxsize=0.0, output_savg=False, - xbin_refine_factor=2, ybin_refine_factor=2, - zbin_refine_factor=1, max_cells_per_dim=100, - c_api_timer=False, isa=r'fastest', weight_type=None): + periodic=True, X2=None, Y2=None, Z2=None, weights2=None, + verbose=False, boxsize=0.0, output_savg=False, + fast_divide_and_NR_steps=0, + xbin_refine_factor=2, ybin_refine_factor=2, + zbin_refine_factor=1, max_cells_per_dim=100, + c_api_timer=False, isa=r'fastest', weight_type=None): """ Calculate the 2-D pair-counts corresponding to the redshift-space correlation function, :math:`\\xi(s, \mu)` Pairs which are separated @@ -111,6 +112,12 @@ def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=No precision and can not be trusted. If you need accurate ``s`` values, then pass in double precision arrays for the particle positions. + fast_divide_and_NR_steps: integer (default 0) + Replaces the division in ``AVX`` implementation with an approximate + reciprocal, followed by ``fast_divide_and_NR_steps`` of Newton-Raphson. + Can improve runtime by ~15-20% on older computers. Value of 0 uses + the standard division operation. + (xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) Controls the refinement on the cell sizes. Can have up to a 20% impact on runtime. 
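
A minimal usage sketch of the new keyword (illustrative only, not part of this patch): the `fast_divide_and_NR_steps` option slots into the theory `DDsmu` call like any other optional argument. The bin file path and the random particle positions below are placeholders, and `binfile` is assumed to name a file of s-bin edges, as for the other theory pair-counters.

    import numpy as np
    from Corrfunc.theory import DDsmu

    boxsize = 420.0
    N = 100000
    X = np.random.uniform(0.0, boxsize, N)
    Y = np.random.uniform(0.0, boxsize, N)
    Z = np.random.uniform(0.0, boxsize, N)

    results = DDsmu(autocorr=1, nthreads=4, binfile='sbins',  # placeholder bin file
                    mu_max=1.0, nmu_bins=20,
                    X1=X, Y1=Y, Z1=Z,
                    periodic=True, boxsize=boxsize,
                    output_savg=True,
                    fast_divide_and_NR_steps=2)  # 0 (the default) keeps the exact divide

With a non-zero value, the AVX kernel swaps the divide for an approximate reciprocal refined by that many Newton-Raphson steps; per the docstring above, this can buy roughly 15-20% runtime on older CPUs at a small cost in precision.
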
@@ -283,6 +290,7 @@ def DDsmu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=No verbose=verbose, boxsize=boxsize, output_savg=output_savg, + fast_divide_and_NR_steps=fast_divide_and_NR_steps, xbin_refine_factor=xbin_refine_factor, ybin_refine_factor=ybin_refine_factor, zbin_refine_factor=zbin_refine_factor, diff --git a/common.mk b/common.mk index 455e2187..b2da73f2 100644 --- a/common.mk +++ b/common.mk @@ -245,17 +245,11 @@ ifeq ($(DO_CHECKS), 1) $(error $(ccred) DOUBLE_PREC must be enabled with OUTPUT_THETAAVG -- loss of precision will give you incorrect results for the outer bins (>=20-30 million pairs) $(ccreset)) endif endif - - # ifeq (FAST_DIVIDE,$(findstring FAST_DIVIDE,$(OPT))) - # ifneq (USE_AVX,$(findstring USE_AVX,$(OPT))) - # $(warning Makefile option $(ccblue)"FAST_DIVIDE"$(ccreset) will not do anything unless $(ccblue)USE_AVX$(ccreset) is set) - # endif - # endif ## done with check for conflicting options ifeq (icc,$(findstring icc,$(CC))) CFLAGS += -xhost -opt-prefetch -opt-prefetch-distance=16 #-vec-report6 - ifeq (USE_OMP,$(findstring USE_OMP,$(OPT))) + ifeq (USE_OMP,$(findstring USE_OMP,$(OPT))) CFLAGS += -openmp CLINK += -openmp endif ##openmp with icc @@ -376,123 +370,140 @@ ifeq ($(DO_CHECKS), 1) # All of the python/numpy checks follow export PYTHON_CHECKED ?= 0 export NUMPY_CHECKED ?= 0 + export COMPILE_PYTHON_EXT ?= 0 ifeq ($(PYTHON_CHECKED), 0) - export COMPILE_PYTHON_EXT := 1 - export PYTHON_VERSION_FULL := $(wordlist 2,4,$(subst ., ,$(shell $(PYTHON) --version 2>&1))) - export PYTHON_VERSION_MAJOR := $(word 1,${PYTHON_VERSION_FULL}) - export PYTHON_VERSION_MINOR := $(word 2,${PYTHON_VERSION_FULL}) - - ## I only need this so that I can print out the full python version (correctly) - ## in case of error - PYTHON_VERSION_PATCH := $(word 3,${PYTHON_VERSION_FULL}) - - ## Check numpy version - export NUMPY_VERSION_FULL := $(wordlist 1,3,$(subst ., ,$(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print(numpy.__version__)"))) - export NUMPY_VERSION_MAJOR := $(word 1,${NUMPY_VERSION_FULL}) - export NUMPY_VERSION_MINOR := $(word 2,${NUMPY_VERSION_FULL}) - - ## Same reason as python patch level. - NUMPY_VERSION_PATCH := $(word 3,${NUMPY_VERSION_FULL}) - - ### Check for minimum python + numpy versions. In theory, I should also check - ### that *any* python and numpy are available but that seems too much effort - MIN_PYTHON_MAJOR := 2 - MIN_PYTHON_MINOR := 6 - - MIN_NUMPY_MAJOR := 1 - MIN_NUMPY_MINOR := 7 - - PYTHON_AVAIL := $(shell [ $(PYTHON_VERSION_MAJOR) -gt $(MIN_PYTHON_MAJOR) -o \( $(PYTHON_VERSION_MAJOR) -eq $(MIN_PYTHON_MAJOR) -a $(PYTHON_VERSION_MINOR) -ge $(MIN_PYTHON_MINOR) \) ] && echo true) - NUMPY_AVAIL := $(shell [ $(NUMPY_VERSION_MAJOR) -gt $(MIN_NUMPY_MAJOR) -o \( $(NUMPY_VERSION_MAJOR) -eq $(MIN_NUMPY_MAJOR) -a $(NUMPY_VERSION_MINOR) -ge $(MIN_NUMPY_MINOR) \) ] && echo true) + # This is very strange -- requested 'version' info goes to stderr!! 
+ # anything user-requested should always go to stdout IMHO -- MS 17/8/2018 + # Only stdout is passed back as the output; therefore need to redirect + # stderr to stdout, and then capture that output to `PYTHON_FOUND` + PYTHON_FOUND := $(shell $(PYTHON) --version 2>&1)) + PYTHON_CHECKED := 1 + ifdef PYTHON_FOUND + export PYTHON_VERSION_FULL := $(wordlist 2,4,$(subst ., ,${PYTHON_FOUND})) + export PYTHON_VERSION_MAJOR := $(word 1,${PYTHON_VERSION_FULL}) + export PYTHON_VERSION_MINOR := $(word 2,${PYTHON_VERSION_FULL}) + + ## I only need this so that I can print out the full python version (correctly) + ## in case of error + PYTHON_VERSION_PATCH := $(word 3,${PYTHON_VERSION_FULL}) + + ## Check numpy version + export NUMPY_VERSION_FULL := $(wordlist 1,3,$(subst ., ,$(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print(numpy.__version__)"))) + export NUMPY_VERSION_MAJOR := $(word 1,${NUMPY_VERSION_FULL}) + export NUMPY_VERSION_MINOR := $(word 2,${NUMPY_VERSION_FULL}) + + ## Same reason as python patch level. + NUMPY_VERSION_PATCH := $(word 3,${NUMPY_VERSION_FULL}) + + ### Check for minimum python + numpy versions. In theory, I should also check + ### that *any* python and numpy are available but that seems too much effort + MIN_PYTHON_MAJOR := 2 + MIN_PYTHON_MINOR := 6 + + MIN_NUMPY_MAJOR := 1 + MIN_NUMPY_MINOR := 7 + + PYTHON_AVAIL := $(shell [ $(PYTHON_VERSION_MAJOR) -gt $(MIN_PYTHON_MAJOR) -o \( $(PYTHON_VERSION_MAJOR) -eq $(MIN_PYTHON_MAJOR) -a $(PYTHON_VERSION_MINOR) -ge $(MIN_PYTHON_MINOR) \) ] && echo true) + NUMPY_AVAIL := $(shell [ $(NUMPY_VERSION_MAJOR) -gt $(MIN_NUMPY_MAJOR) -o \( $(NUMPY_VERSION_MAJOR) -eq $(MIN_NUMPY_MAJOR) -a $(NUMPY_VERSION_MINOR) -ge $(MIN_NUMPY_MINOR) \) ] && echo true) + + ifeq ($(PYTHON_AVAIL),true) + ifeq ($(NUMPY_AVAIL),true) + export COMPILE_PYTHON_EXT := 1 + endif + endif - ifneq ($(PYTHON_AVAIL),true) - $(warning $(ccmagenta) Found python version $(PYTHON_VERSION_MAJOR).$(PYTHON_VERSION_MINOR).$(PYTHON_VERSION_PATCH) but minimum required python is $(MIN_PYTHON_MAJOR).$(MIN_PYTHON_MINOR) $(ccreset)) - COMPILE_PYTHON_EXT := 0 - endif + ifneq ($(PYTHON_AVAIL),true) + $(warning $(ccmagenta) Found python version $(PYTHON_VERSION_MAJOR).$(PYTHON_VERSION_MINOR).$(PYTHON_VERSION_PATCH) but minimum required python is $(MIN_PYTHON_MAJOR).$(MIN_PYTHON_MINOR) $(ccreset)) + export COMPILE_PYTHON_EXT := 0 + endif - ifneq ($(NUMPY_AVAIL),true) - $(warning $(ccmagenta) Found NUMPY version $(NUMPY_VERSION_MAJOR).$(NUMPY_VERSION_MINOR).$(NUMPY_VERSION_PATCH) but minimum required numpy is $(MIN_NUMPY_MAJOR).$(MIN_NUMPY_MINOR) $(ccreset)) - COMPILE_PYTHON_EXT := 0 - endif + ifneq ($(NUMPY_AVAIL),true) + $(warning $(ccmagenta) Found NUMPY version $(NUMPY_VERSION_MAJOR).$(NUMPY_VERSION_MINOR).$(NUMPY_VERSION_PATCH) but minimum required numpy is $(MIN_NUMPY_MAJOR).$(MIN_NUMPY_MINOR) $(ccreset)) + export COMPILE_PYTHON_EXT := 0 + endif - ifneq ($(COMPILE_PYTHON_EXT), 0) - ifndef PYTHON_CONFIG_EXE - ifeq ($(PYTHON_VERSION_MAJOR), 2) - PYTHON_CONFIG_EXE:=python-config - else - PYTHON_CONFIG_EXE:=python3-config + ifneq ($(COMPILE_PYTHON_EXT), 0) + ifndef PYTHON_CONFIG_EXE + ifeq ($(PYTHON_VERSION_MAJOR), 2) + PYTHON_CONFIG_EXE:=python-config + else + PYTHON_CONFIG_EXE:=python3-config + endif + ifneq ($(PYTHON), python) + PYTHON_CONFIG_EXE:=$(dir $(PYTHON))$(PYTHON_CONFIG_EXE) + $(warning $(ccblue)"PYTHON"$(ccreset) is set to $(ccblue)$(PYTHON)$(ccreset); using $(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset) as $(ccblue)python-config$(ccreset). 
If this is not correct, please also set $(ccblue)"PYTHON_CONFIG_EXE"$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset)) + endif endif - ifneq ($(PYTHON), python) - PYTHON_CONFIG_EXE:=$(dir $(PYTHON))$(PYTHON_CONFIG_EXE) - $(warning $(ccblue)"PYTHON"$(ccreset) is set to $(ccblue)$(PYTHON)$(ccreset); using $(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset) as $(ccblue)python-config$(ccreset). If this is not correct, please also set $(ccblue)"PYTHON_CONFIG_EXE"$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset)) + PYTHON_CONFIG_INCL := $(shell $(PYTHON_CONFIG_EXE) --includes 2>/dev/null) + ifndef PYTHON_CONFIG_INCL + $(error $(ccred)python-config$(ccreset) ($(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset)) not found. Please set $(ccgreen)PYTHON_CONFIG_EXE$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset) before installing $(DISTNAME).$(VERSION). Installing $(ccblue)python-devel$(ccreset) might fix this issue $(ccreset)) endif - endif - PYTHON_CONFIG_INCL := $(shell $(PYTHON_CONFIG_EXE) --includes 2>/dev/null) - ifndef PYTHON_CONFIG_INCL - $(error $(ccred)python-config$(ccreset) ($(ccblue)$(PYTHON_CONFIG_EXE)$(ccreset)) not found. Please set $(ccgreen)PYTHON_CONFIG_EXE$(ccreset) in $(ccgreen)"common.mk"$(ccreset) to appropriate $(ccblue)python-config$(ccreset) before installing $(DISTNAME).$(VERSION). Installing $(ccblue)python-devel$(ccreset) might fix this issue $(ccreset)) - endif - PYTHON_CONFIG_INCL:=$(patsubst -I%,-isystem%, $(PYTHON_CONFIG_INCL)) - - # NUMPY is available -> next step should not fail - # That's why we are not checking if the NUMPY_INCL_FLAG is defined. - ifeq ($(NUMPY_CHECKED), 0) - export NUMPY_INCL_FLAG := $(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print('-isystem ' + numpy.__path__[0] + '/core/include/numpy/')") - # Take the second word -> the path (the first word is "isystem") - NUMPY_INCL_PATH := $(word 2, ${NUMPY_INCL_FLAG}) - # Now check that the 'arrayobject.h' file is present in the - # supposed numpy directory. Otherwise, compilation will fail. - # The absence of the file likely indicates a missing numpy-devel - # package (see issue #134 on github) - NUMPY_NEEDED_HEADER_FILE := ${NUMPY_INCL_PATH}arrayobject.h - ifeq (,$(wildcard ${NUMPY_NEEDED_HEADER_FILE})) - $(error Required $(ccred)numpy headers$(ccreset) are missing...stopping the compilation. You might be able to fix this by installing $(ccblue)numpy-devel$(ccreset)) + PYTHON_CONFIG_INCL:=$(patsubst -I%,-isystem%, $(PYTHON_CONFIG_INCL)) + + # NUMPY is available -> next step should not fail + # That's why we are not checking if the NUMPY_INCL_FLAG is defined. + ifeq ($(NUMPY_CHECKED), 0) + export NUMPY_INCL_FLAG := $(shell $(PYTHON) -c "from __future__ import print_function; import numpy; print('-isystem ' + numpy.__path__[0] + '/core/include/numpy/')") + # Take the second word -> the path (the first word is "isystem") + NUMPY_INCL_PATH := $(word 2, ${NUMPY_INCL_FLAG}) + # Now check that the 'arrayobject.h' file is present in the + # supposed numpy directory. Otherwise, compilation will fail. + # The absence of the file likely indicates a missing numpy-devel + # package (see issue #134 on github) + NUMPY_NEEDED_HEADER_FILE := ${NUMPY_INCL_PATH}arrayobject.h + ifeq (,$(wildcard ${NUMPY_NEEDED_HEADER_FILE})) + $(error Required $(ccred)numpy headers$(ccreset) are missing...stopping the compilation. 
You might be able to fix this by installing $(ccblue)numpy-devel$(ccreset)) + endif + export NUMPY_CHECKED:=1 endif - export NUMPY_CHECKED:=1 - endif - export PYTHON_CFLAGS := $(PYTHON_CONFIG_INCL) $(NUMPY_INCL_FLAG) - export PYTHON_LIBDIR := $(shell $(PYTHON_CONFIG_EXE) --prefix)/lib - export PYTHON_LIBS := $(shell $(PYTHON_CONFIG_EXE) --libs) - export PYTHON_LINK := - # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) - # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) - SOABI := $(shell $(PYTHON) -c "from __future__ import print_function; import sysconfig; print(sysconfig.get_config_var('SOABI'))" 2>/dev/null) - export PYTHON_SOABI := - ifdef SOABI - ifneq ($(SOABI), None) - PYTHON_SOABI = .$(SOABI) + export PYTHON_CFLAGS := $(PYTHON_CONFIG_INCL) $(NUMPY_INCL_FLAG) + export PYTHON_LIBDIR := $(shell $(PYTHON_CONFIG_EXE) --prefix)/lib + export PYTHON_LIBS := $(shell $(PYTHON_CONFIG_EXE) --libs) + export PYTHON_LINK := + # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) + # export PYTHON_LINK := -L$(PYTHON_LIBDIR) $(PYTHON_LIBS) -Xlinker -rpath -Xlinker $(PYTHON_LIBDIR) + SOABI := $(shell $(PYTHON) -c "from __future__ import print_function; import sysconfig; print(sysconfig.get_config_var('SOABI'))" 2>/dev/null) + export PYTHON_SOABI := + ifdef SOABI + ifneq ($(SOABI), None) + PYTHON_SOABI = .$(SOABI) + endif endif - endif - export PYTHON_SOABI - # export PYTHON_LIB_BASE := $(strip $(subst -l,lib, $(filter -lpython%,$(PYTHON_LIBS)))) - - ### Check if conda is being used on OSX - then we need to fix python link libraries - export FIX_PYTHON_LINK := 0 - # ifeq ($(CONDA_BUILD), 0) - # ## Check if conda build is under progress -> do nothing in that case. Let conda handle it - # ifeq ($(UNAME), Darwin) - # PATH_TO_PYTHON := $(shell which python) - # ifeq (conda, $(findstring conda, $(PATH_TO_PYTHON))) - # FIX_PYTHON_LINK := 1 - # endif - # endif - # endif - ifeq ($(UNAME), Darwin) - # PYTHON_LINK := $(filter-out -framework, $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out -ldl, $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out CoreFoundation, $(PYTHON_LINK)) - PYTHON_LINK += -dynamiclib -Wl,-compatibility_version,$(ABI_COMPAT_VERSION) -Wl,-current_version,$(VERSION) -undefined dynamic_lookup - PYTHON_LINK += -headerpad_max_install_names - - ### Another check for stack-size. travis ci chokes on this with gcc - # comma := , - # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) - # PYTHON_LINK := $(filter-out -stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) - endif #Darwin checks - export PYTHON_CHECKED:=1 - endif # compile python extensions - endif + export PYTHON_SOABI + # export PYTHON_LIB_BASE := $(strip $(subst -l,lib, $(filter -lpython%,$(PYTHON_LIBS)))) + + ### Check if conda is being used on OSX - then we need to fix python link libraries + export FIX_PYTHON_LINK := 0 + # ifeq ($(CONDA_BUILD), 0) + # ## Check if conda build is under progress -> do nothing in that case. 
Let conda handle it + # ifeq ($(UNAME), Darwin) + # PATH_TO_PYTHON := $(shell which python) + # ifeq (conda, $(findstring conda, $(PATH_TO_PYTHON))) + # FIX_PYTHON_LINK := 1 + # endif + # endif + # endif + ifeq ($(UNAME), Darwin) + # PYTHON_LINK := $(filter-out -framework, $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out -ldl, $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out CoreFoundation, $(PYTHON_LINK)) + PYTHON_LINK += -dynamiclib -Wl,-compatibility_version,$(ABI_COMPAT_VERSION) -Wl,-current_version,$(VERSION) -undefined dynamic_lookup + PYTHON_LINK += -headerpad_max_install_names + + ### Another check for stack-size. travis ci chokes on this with gcc + # comma := , + # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out -Wl$(comma)-stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) + # PYTHON_LINK := $(filter-out -stack_size$(comma)1000000$(comma), $(PYTHON_LINK)) + endif #Darwin checks + export PYTHON_FOUND :=1 + endif # compile python extensions + else + $(warning There was an error running python -- currently set to $(ccblue)[${PYTHON}]$(ccreset)) + $(warning Skipping the creation of python bindings) + endif ## ifdef PYTHON_FOUND + endif ## PYTHON_CHECKED ### Done with python checks diff --git a/docs/source/conf.py b/docs/source/conf.py index 1b5f3406..ad17b7db 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -175,7 +175,8 @@ def __getattr__(cls, name): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] +html_static_path = [] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/mocks.options b/mocks.options index e82ce2e6..e95844cb 100644 --- a/mocks.options +++ b/mocks.options @@ -1,6 +1,6 @@ -### Special option for DDrppi_mocks +### Special option for DDrppi_mocks/DDsmu_mocks OPT += -DOUTPUT_RPAVG ### Enabling this DOES NOT cause too much of a runtime-hit for DDrppi (<= 10% performance hit) -#OPT += -DFAST_DIVIDE ##replaces divide in DDrppi with approximate divides. If you really must get that extra ~20% performance boost +#OPT += -DFAST_DIVIDE=2 ##replaces a divide with approximate reciprocals, followed by 'FAST_DIVIDE' number of Newton-Raphson steps. Trade-off between speed and accuracy; may be slower on newer computers ### Specific options for wtheta (DDtheta_mocks.c) #OPT += -DOUTPUT_THETAAVG diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 46c25013..1618906c 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -220,6 +220,12 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE if(options->c_api_timer) { gettimeofday(&t0, NULL); } + if(options->fast_divide_and_NR_steps >= MAX_FAST_DIVIDE_NR_STEPS) { + fprintf(stderr, ANSI_COLOR_MAGENTA"Warning: The number of requested Newton-Raphson steps = %u is larger than max. allowed steps = %u." 
+ " Switching to a standard divide"ANSI_COLOR_RESET"\n", + options->fast_divide_and_NR_steps, MAX_FAST_DIVIDE_NR_STEPS); + options->fast_divide_and_NR_steps = 0; + } //Check inputs if(ND1 == 0 || (autocorr == 0 && ND2 == 0)) { @@ -604,7 +610,7 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE const int status = countpairs_rp_pi_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N1, x1, y1, z1, d1, weights1, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, sqr_rpmax, sqr_rpmin, nrpbin, npibin, rupp_sqr, pimax,max_sep, this_rpavg, npairs, @@ -632,7 +638,7 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE const int status = countpairs_rp_pi_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N2, x2, y2, z2, d2, weights2, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, sqr_rpmax, sqr_rpmin, nrpbin, npibin, rupp_sqr, pimax,max_sep, this_rpavg, npairs, diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src index 91ca2777..14bc560a 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.h.src @@ -24,7 +24,7 @@ extern "C" { typedef int (*countpairs_mocks_func_ptr_DOUBLE)(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, uint64_t *src_npairs, diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src index dde05f57..b2e8fad0 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_kernels.c.src @@ -26,7 +26,7 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, @@ -177,7 +177,7 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, AVX_FLOATS m_sqr_Dpar, m_sqr_Dperp; { - const AVX_FLOATS m_dsep = AVX_SUBTRACT_FLOATS(AVX_SQUARE_FLOAT(m_d2), AVX_SQUARE_FLOAT(m_dpos)); + const AVX_FLOATS m_s_dot_l = AVX_SUBTRACT_FLOATS(AVX_SQUARE_FLOAT(m_d2), AVX_SQUARE_FLOAT(m_dpos)); /* const AVX_FLOATS m_dz_mask = AVX_COMPARE_FLOATS(m_perpz, m_max_sep, _CMP_LT_OQ); */ /* if(AVX_TEST_COMPARISON(m_dz_mask) == 0) { */ @@ -185,12 +185,12 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, /* break; */ /* } */ - const AVX_FLOATS m_numerator = AVX_SQUARE_FLOAT(m_dsep); + const AVX_FLOATS m_sqr_s_dot_l = AVX_SQUARE_FLOAT(m_s_dot_l); const AVX_FLOATS m_sqr_perpx = AVX_SQUARE_FLOAT(m_perpx); const AVX_FLOATS m_sqr_perpy = AVX_SQUARE_FLOAT(m_perpy); const AVX_FLOATS m_sqr_perpz = AVX_SQUARE_FLOAT(m_perpz); const 
AVX_FLOATS m_sqr_sep = AVX_ADD_FLOATS(m_sqr_perpx, AVX_ADD_FLOATS(m_sqr_perpy, m_sqr_perpz));//3-d separation - + //The 3-d separation (| s.s |)^2 *must* be less than (pimax^2 + rpmax^2). If not, one of the //constraints for counting the pair (i.e., rp < rpmax, \pi < pimax) must be violated and //we would discard the pair. @@ -201,53 +201,33 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, //However, division is slow -> so we will check if \pimax^2 * |l| ^2 < |s.l|^2. If not, then the //value of \pi (after division) *must* be larger than \pimax -> in which case we would //not count that pair anway. - const AVX_FLOATS m_sqr_pimax_times_l = AVX_MULTIPLY_FLOATS(m_sqr_pimax, m_sqr_norm_l); - const AVX_FLOATS m_mask_pimax_sep = AVX_COMPARE_FLOATS(m_numerator, m_sqr_pimax_times_l, _CMP_LT_OQ);// is pi < pimax ? + const AVX_FLOATS m_sqr_pimax_times_sqr_l = AVX_MULTIPLY_FLOATS(m_sqr_pimax, m_sqr_norm_l); + const AVX_FLOATS m_mask_pimax_sep = AVX_COMPARE_FLOATS(m_sqr_s_dot_l, m_sqr_pimax_times_sqr_l, _CMP_LT_OQ);// is pi < pimax ? + +#if 0 + /* This could have been an additional check before the divide but does not seem to boost performance */ + const AVX_FLOATS m_sqr_rpmax_times_sqr_l = AVX_MULTIPLY_FLOATS(m_sqr_rpmax, m_sqr_norm_l); + const AVX_FLOATS m_sqr_s_minus_sqr_sdotl = AVX_SUBTRACT_FLOATS(m_sqr_sep, m_sqr_s_dot_l); + const AVX_FLOATS m_sqr_rpmax_initial_mask = AVX_COMPARE_FLOATS(m_sqr_s_minus_sqr_sdotl, m_sqr_rpmax_times_sqr_l, _CMP_LT_OQ);/* is rp < rpmax */ + //If the bits are all 0, then *none* of the pairs satisfy the pimax + rpmax constraints. + const AVX_FLOATS m_mask = AVX_BITWISE_AND(AVX_BITWISE_AND(m_mask_3d_sep, m_mask_pimax_sep), m_sqr_rpmax_initial_mask); +#else const AVX_FLOATS m_mask = AVX_BITWISE_AND(m_mask_3d_sep, m_mask_pimax_sep); +#endif + if(AVX_TEST_COMPARISON(m_mask)==0) { continue; } - if(fast_divide == 0) { - //regular division -> slow op - m_sqr_Dpar = AVX_DIVIDE_FLOATS(m_numerator,m_sqr_norm_l); - //The divide is the actual operation we need - // but divides are about 10x slower than multiplies. So, I am replacing it - //with a approximate reciprocal in floating point - // + 2 iterations of newton-raphson in case of DOUBLE - } else { - //following blocks do an approximate reciprocal followed by two iterations of Newton-Raphson - //However, the exact implementation depends on the precision. floats have an inbuilt approx. reciprocal - //but doubles do not. So, we have to 'fake' an approximate reciprocal for doubles by converting to float - //taking the approximate reciprocal, and then convert back to double -#ifndef DOUBLE_PREC - const AVX_FLOATS rc = _mm256_rcp_ps(m_sqr_norm_l);//intrinsic for 256 bit approximate reciprocal -#else - //we have to do this for doubles now. - //if the vrcpps instruction is not generated, there will - //be a ~70 cycle performance hit from switching between - //AVX and SSE modes. - const __m128 float_tmp1 = _mm256_cvtpd_ps(m_sqr_norm_l);//convert double to float -> not avx_floats := _m256d - //(convert 4 doubles into 4 floats -> use half of available 256 bit SIMD registers) - __m128 float_inv_tmp1 = _mm_rcp_ps(float_tmp1);//intrinsic for 128 bit float approximate reciprocal - const AVX_FLOATS rc = _mm256_cvtps_pd(float_inv_tmp1);//convert back to double -#endif//DOUBLE_PREC - - //We have the double->float->approx. reciprocal->double process done. - //Now improve the accuracy of the divide with newton-raphson. 
- - //Ist iteration of NewtonRaphson - const AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0); - const AVX_FLOATS rc1 = AVX_MULTIPLY_FLOATS(rc, - AVX_SUBTRACT_FLOATS(two, - AVX_MULTIPLY_FLOATS(m_sqr_norm_l,rc))); - //2nd iteration of NewtonRaphson - const AVX_FLOATS rc2 = AVX_MULTIPLY_FLOATS(rc1, - AVX_SUBTRACT_FLOATS(two, - AVX_MULTIPLY_FLOATS(m_sqr_norm_l,rc1))); - m_sqr_Dpar = AVX_MULTIPLY_FLOATS(m_numerator,rc2); - }//end of FAST_DIVIDE + + /* Check if fast_divide is enabled and either use the normal divide or + use the approx. reciprocal followed by `fast_divide_and_NR_steps` number + of Newton-Raphson steps to improve numerical accuracy. + + macro is defined in `avx_calls.h` + */ + CHECK_AND_FAST_DIVIDE(m_sqr_Dpar, m_sqr_s_dot_l, m_sqr_norm_l, fast_divide_and_NR_steps); m_sqr_Dperp = AVX_SUBTRACT_FLOATS(m_sqr_sep,m_sqr_Dpar); } @@ -413,7 +393,7 @@ static inline int countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE(const int64_t N0, static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, @@ -429,7 +409,7 @@ static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0, const int32_t need_rpavg = src_rpavg != NULL; const int32_t need_weightavg = src_weightavg != NULL; - (void) fast_divide; //unused + (void) fast_divide_and_NR_steps; //unused SSE_FLOATS m_rupp_sqr[nbin]; for(int i=0;i in which case we would //not count that pair anway. const SSE_FLOATS m_sqr_pimax_times_l = SSE_MULTIPLY_FLOATS(m_sqr_pimax, m_sqr_norm_l); - const SSE_FLOATS m_mask_pimax_sep = SSE_COMPARE_FLOATS_LT(m_numerator, m_sqr_pimax_times_l);// is pi < pimax ? + const SSE_FLOATS m_mask_pimax_sep = SSE_COMPARE_FLOATS_LT(m_sqr_s_dot_l, m_sqr_pimax_times_l);// is pi < pimax ? //If the bits are all 0, then *none* of the pairs satisfy the pimax + rpmax constraints. const SSE_FLOATS m_mask = SSE_BITWISE_AND(m_mask_3d_sep, m_mask_pimax_sep); if(SSE_TEST_COMPARISON(m_mask)==0) { continue; } - m_sqr_Dpar = SSE_DIVIDE_FLOATS(m_numerator,m_sqr_norm_l); + m_sqr_Dpar = SSE_DIVIDE_FLOATS(m_sqr_s_dot_l,m_sqr_norm_l); //The divide is the actual operation we need // but divides are about 10x slower than multiplies. 
m_sqr_Dperp = SSE_SUBTRACT_FLOATS(m_sqr_sep,m_sqr_Dpar); @@ -764,7 +744,7 @@ static inline int countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE(const int64_t N0, static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, DOUBLE *d0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, DOUBLE *d1, const weight_struct_DOUBLE *weights1, const int same_cell, - const int fast_divide, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_rpmax, const DOUBLE sqr_rpmin, const int nbin, const int npibin, const DOUBLE *rupp_sqr, const DOUBLE pimax, const DOUBLE max_sep, DOUBLE *src_rpavg, uint64_t *src_npairs, @@ -781,7 +761,7 @@ static inline int countpairs_rp_pi_mocks_fallback_DOUBLE(const int64_t N0, DOUBL const int32_t need_rpavg = src_rpavg != NULL; const int32_t need_weightavg = src_weightavg != NULL; - (void) fast_divide;//unused parameter but required to keep the same function signature amongst the kernels + (void) fast_divide_and_NR_steps;//unused parameter but required to keep the same function signature amongst the kernels /*----------------- FALLBACK CODE --------------------*/ const int64_t totnbins = (npibin+1)*(nbin+1); diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index b4fbf26a..7443edf8 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -221,6 +221,12 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D if(options->c_api_timer) { gettimeofday(&t0, NULL); } + if(options->fast_divide_and_NR_steps >= MAX_FAST_DIVIDE_NR_STEPS) { + fprintf(stderr, ANSI_COLOR_MAGENTA"Warning: The number of requested Newton-Raphson steps = %u is larger than max. allowed steps = %u." + " Switching to a standard divide"ANSI_COLOR_RESET"\n", + options->fast_divide_and_NR_steps, MAX_FAST_DIVIDE_NR_STEPS); + options->fast_divide_and_NR_steps = 0; + } //Check inputs if(ND1 == 0 || (autocorr == 0 && ND2 == 0)) { @@ -606,7 +612,7 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D const int status = countpairs_s_mu_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N1, x1, y1, z1, d1, weights1, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, smax, smin, nsbin, nmu_bins, supp_sqr, mu_max, this_savg, npairs, @@ -634,7 +640,7 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D const int status = countpairs_s_mu_mocks_function_DOUBLE(N1, x1, y1, z1, d1, weights1, N2, x2, y2, z2, d2, weights2, same_cell, - options->fast_divide, + options->fast_divide_and_NR_steps, smax, smin, nsbin, nmu_bins, supp_sqr, mu_max, this_savg, npairs, diff --git a/mocks/python_bindings/_countpairs_mocks.c b/mocks/python_bindings/_countpairs_mocks.c index 2d34bcc4..2240aa6c 100644 --- a/mocks/python_bindings/_countpairs_mocks.c +++ b/mocks/python_bindings/_countpairs_mocks.c @@ -75,7 +75,7 @@ static PyMethodDef module_methods[] = { " RA2=None, DEC2=None, CZ2=None, weights2=None,\n" " is_comoving_dist=False,\n" " verbose=False, output_rpavg=False,\n" - " fast_divide=False, xbin_refine_factor=2, \n" + " fast_divide_and_NR_steps=0, xbin_refine_factor=2, \n" " ybin_refine_factor=2, zbin_refine_factor=1, \n" " max_cells_per_dim=100, \n" " c_api_timer=False, isa=-1)\n" @@ -175,10 +175,11 @@ static PyMethodDef module_methods[] = { " precision and can not be trusted. 
If you need accurate ``rpavg``\n" " values, then pass in double precision arrays for the particle positions.\n" "\n" - "fast_divide: boolean (default false)\n" - " Boolean flag to replace the division in ``AVX`` implementation with an\n" - " approximate reciprocal, followed by a Newton-Raphson step. Improves\n" - " runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.\n" + "fast_divide_and_NR_steps: integer (default 0)\n" + " Replaces the division in ``AVX`` implementation with an\n" + " approximate reciprocal, followed by ``fast_divide_and_NR_steps`` " + " Newton-Raphson step. Can improve \n" + " runtime by ~15-20%. Value of 0 keeps the standard division.\n" "\n" "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n" " Controls the refinement on the cell sizes. Can have up to a 20% impact \n" @@ -242,7 +243,7 @@ static PyMethodDef module_methods[] = { " RA2=None, DEC2=None, CZ2=None, weights2=None,\n" " is_comoving_dist=False,\n" " verbose=False, output_savg=False,\n" - " fast_divide=False, xbin_refine_factor=2, \n" + " fast_divide_and_NR_steps=0, xbin_refine_factor=2, \n" " ybin_refine_factor=2, zbin_refine_factor=1, \n" " max_cells_per_dim=100, \n" " c_api_timer=False, isa=-1)\n" @@ -338,10 +339,11 @@ static PyMethodDef module_methods[] = { " precision and can not be trusted. If you need accurate ``savg``\n" " values, then pass in double precision arrays for the particle positions.\n" "\n" - "fast_divide: boolean (default false)\n" - " Boolean flag to replace the division in ``AVX`` implementation with an\n" - " approximate reciprocal, followed by a Newton-Raphson step. Improves\n" - " runtime by ~15-20%. Loss of precision is at the 5-6th decimal place.\n" + "fast_divide_and_NR_steps: integer (default 0)\n" + " Replaces the division in ``AVX`` implementation with an\n" + " approximate reciprocal, followed by ``fast_divide_and_NR_steps`` " + " Newton-Raphson step. Can improve \n" + " runtime by ~15-20%. Value of 0 keeps the standard division.\n" "\n" "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n" " Controls the refinement on the cell sizes. 
Can have up to a 20% impact \n" @@ -1062,7 +1064,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg options.verbose = 0; options.instruction_set = -1; options.periodic = 0; - options.fast_divide=0; + options.fast_divide_and_NR_steps=0; options.c_api_timer = 0; int8_t xbin_ref=options.bin_refine_factors[0], ybin_ref=options.bin_refine_factors[1], @@ -1091,7 +1093,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg "is_comoving_dist", "verbose", /* keyword verbose -> print extra info at runtime + progressbar */ "output_rpavg", - "fast_divide", + "fast_divide_and_NR_steps", "xbin_refine_factor", "ybin_refine_factor", "zbin_refine_factor", @@ -1115,7 +1117,7 @@ static PyObject *countpairs_countpairs_rp_pi_mocks(PyObject *self, PyObject *arg &(options.is_comoving_dist), &(options.verbose), &(options.need_avg_sep), - &(options.fast_divide), + &(options.fast_divide_and_NR_steps), &xbin_ref, &ybin_ref, &zbin_ref, &(options.max_cells_per_dim), &(options.c_api_timer), @@ -1388,7 +1390,7 @@ static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args options.verbose = 0; options.instruction_set = -1; options.periodic = 0; - options.fast_divide=0; + options.fast_divide_and_NR_steps=0; options.c_api_timer = 0; int8_t xbin_ref=options.bin_refine_factors[0], ybin_ref=options.bin_refine_factors[1], @@ -1419,7 +1421,7 @@ static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args "is_comoving_dist", "verbose", /* keyword verbose -> print extra info at runtime + progressbar */ "output_savg", - "fast_divide", + "fast_divide_and_NR_steps", "xbin_refine_factor", "ybin_refine_factor", "zbin_refine_factor", @@ -1443,7 +1445,7 @@ static PyObject *countpairs_countpairs_s_mu_mocks(PyObject *self, PyObject *args &(options.is_comoving_dist), &(options.verbose), &(options.need_avg_sep), - &(options.fast_divide), + &(options.fast_divide_and_NR_steps), &xbin_ref, &ybin_ref, &zbin_ref, &(options.max_cells_per_dim), &(options.c_api_timer), diff --git a/mocks/tests/tests_mocks.c b/mocks/tests/tests_mocks.c index 2f5a15cd..61f94fd2 100644 --- a/mocks/tests/tests_mocks.c +++ b/mocks/tests/tests_mocks.c @@ -537,9 +537,8 @@ int main(int argc, char **argv) options.verbose=0; options.periodic=0; options.float_type=sizeof(double); - options.fast_divide=0; + options.fast_divide_and_NR_steps=0; options.fast_acos=0; - //options.instruction_set = FALLBACK; int status = init_cosmology(cosmology_flag); if(status != EXIT_SUCCESS) { diff --git a/rules.mk b/rules.mk index a7756045..80900870 100644 --- a/rules.mk +++ b/rules.mk @@ -56,7 +56,7 @@ $(TARGET).o: $(TARGET).c $(ROOT_DIR)/common.mk Makefile $(ROOT_DIR)/theory.optio %_float.o: %_float.c $(CC) -DNDOUBLE_PREC $(CFLAGS) $(INCLUDE) $(EXTRA_INCL) -c $< -o $@ -%.o: %.c $(ROOT_DIR)/common.mk Makefile +%.o: %.c $(ROOT_DIR)/common.mk $(ROOT_DIR)/utils/defs.h Makefile $(CC) $(CFLAGS) $(INCLUDE) $(EXTRA_INCL) -c $< -o $@ $(LIBRARY): $(LIBOBJS) $(ROOT_DIR)/mocks.options $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile diff --git a/theory.options b/theory.options index 1bc87bf5..686555b5 100644 --- a/theory.options +++ b/theory.options @@ -1,9 +1,10 @@ #### Science use-cases for Theory Correlation Functions OPT = -DPERIODIC #OPT += -DOUTPUT_RPAVG ### Enabling this can cause up to a 2x performance hit +#OPT += -DFAST_DIVIDE=2 ##replaces a divide (in DDsmu) with approximate reciprocals, followed by 'FAST_DIVIDE' number of Newton-Raphson steps. 
Trade-off between speed and accuracy; may be slower on newer computers #### Code specs for both theory and data Correlation Functions -OPT += -DDOUBLE_PREC +#OPT += -DDOUBLE_PREC diff --git a/theory/DDsmu/Makefile b/theory/DDsmu/Makefile index ca63851a..4c498b5a 100644 --- a/theory/DDsmu/Makefile +++ b/theory/DDsmu/Makefile @@ -25,15 +25,16 @@ INCL := countpairs_s_mu_kernels_float.c countpairs_s_mu_kernels_double.c count $(UTILS_DIR)/defs.h $(UTILS_DIR)/cpu_features.h \ $(IO_DIR)/ftread.h $(IO_DIR)/io.h $(UTILS_DIR)/utils.h $(UTILS_DIR)/progressbar.h \ $(UTILS_DIR)/weight_functions_double.h $(UTILS_DIR)/weight_functions_float.h $(UTILS_DIR)/weight_functions.h.src \ - $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src + $(UTILS_DIR)/weight_defs_double.h $(UTILS_DIR)/weight_defs_float.h $(UTILS_DIR)/weight_defs.h.src TARGETOBJS := $(TARGETSRC:.c=.o) LIBOBJS := $(LIBSRC:.c=.o) all: $(TARGETS) $(TARGETSRC) $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk Makefile -countpairs_s_mu_impl_double.o:countpairs_s_mu_impl_double.c countpairs_s_mu_impl_double.h countpairs_s_mu_kernels_double.c $(UTILS_DIR)/gridlink_impl_double.h $(UTILS_DIR)/cellarray_double.h -countpairs_s_mu_impl_float.o:countpairs_s_mu_impl_float.c countpairs_s_mu_impl_float.h countpairs_s_mu_kernels_float.c $(UTILS_DIR)/gridlink_impl_float.h $(UTILS_DIR)/cellarray_float.h -countpairs_s_mu.o:countpairs_s_mu.c countpairs_s_mu_impl_double.h countpairs_s_mu_impl_float.h $(INCL) +countpairs_s_mu_impl_double.o:countpairs_s_mu_impl_double.c countpairs_s_mu_impl_double.h countpairs_s_mu_kernels_double.c $(UTILS_DIR)/gridlink_impl_double.h +countpairs_s_mu_impl_float.o:countpairs_s_mu_impl_float.c countpairs_s_mu_impl_float.h countpairs_s_mu_kernels_float.c $(UTILS_DIR)/gridlink_impl_float.h +countpairs_s_mu.o:countpairs_s_mu.c countpairs_s_mu_impl_double.h countpairs_s_mu_impl_float.h countpairs_s_mu.h $(INCL) +countpairs_s_mu_impl_float.c countpairs_s_mu_impl_double.c:countpairs_s_mu_impl.c.src $(INCL) libs: lib lib: $(LIBRARY) diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index 494a6aff..fbbd6d05 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -174,6 +174,13 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 options->max_cells_per_dim = NLATMAX; } + if(options->fast_divide_and_NR_steps >= MAX_FAST_DIVIDE_NR_STEPS) { + fprintf(stderr, ANSI_COLOR_MAGENTA"Warning: The number of requested Newton-Raphson steps = %u is larger than max. allowed steps = %u." + " Switching to a standard divide"ANSI_COLOR_RESET"\n", + options->fast_divide_and_NR_steps, MAX_FAST_DIVIDE_NR_STEPS); + options->fast_divide_and_NR_steps = 0; + } + /* setup interrupt handler -> mostly useful during the python execution. 
Let's Ctrl-C abort the extension */ SETUP_INTERRUPT_HANDLERS(interrupt_handler_countpairs_s_mu_DOUBLE); @@ -449,10 +456,11 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 } const int status = countpairs_s_mu_function_DOUBLE(N1, x1, y1, z1, weights1, N1, x1, y1, z1, weights1, - same_cell - ,sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax - ,ZERO, ZERO, ZERO - ,this_savg, npairs, + same_cell, + options->fast_divide_and_NR_steps, + sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax, + ZERO, ZERO, ZERO, + this_savg, npairs, this_weightavg, extra->weight_method); /* This actually causes a race condition under OpenMP - but mostly I care that an error occurred - rather than the exact value of @@ -485,7 +493,9 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 this_weightavg = weightavg; } const int status = countpairs_s_mu_function_DOUBLE(N1, x1, y1, z1, weights1, - N2, x2, y2, z2, weights2, same_cell, + N2, x2, y2, z2, weights2, + same_cell, + options->fast_divide_and_NR_steps, sqr_smax, sqr_smin, nsbin, nmu_bins, supp_sqr, mu_max, pimax, off_xwrap, off_ywrap, off_zwrap, this_savg, npairs, diff --git a/theory/DDsmu/countpairs_s_mu_impl.h.src b/theory/DDsmu/countpairs_s_mu_impl.h.src index 7194b827..e9f57cd3 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.h.src +++ b/theory/DDsmu/countpairs_s_mu_impl.h.src @@ -22,7 +22,9 @@ extern "C" { extern void interrupt_handler_countpairs_s_mu_DOUBLE(int signo); typedef int (*countpairs_s_mu_func_ptr_DOUBLE)(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, - const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, + const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, diff --git a/theory/DDsmu/countpairs_s_mu_kernels.c.src b/theory/DDsmu/countpairs_s_mu_kernels.c.src index 72bf8044..00e026f7 100644 --- a/theory/DDsmu/countpairs_s_mu_kernels.c.src +++ b/theory/DDsmu/countpairs_s_mu_kernels.c.src @@ -23,7 +23,9 @@ #include "avx_calls.h" static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, - const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, + const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, @@ -211,9 +213,20 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE //There is some s2 that satisfies sqr_smin <= s2 < sqr_smax && mu_min <= |dz| < mu_max s2 = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_smax, s2, m_mask_left); - /*m_mu := sqrt(s2/dz^2) (with masked elements set to mu_max */ - const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, AVX_DIVIDE_FLOATS(m_sqr_zdiff, s2), m_mask_left)); - + /*m_sqr_mu := dz^2/s^2 (with masked 
elements set to mu_max */ + AVX_FLOATS m_sqr_mu = AVX_SETZERO_FLOAT(); + + + /* Check if fast_divide is enabled and either use the normal divide or + use the approx. reciprocal followed by `fast_divide_and_NR_steps` number + of Newton-Raphson steps to improve numerical accuracy. + + macro is defined in `avx_calls.h` + */ + CHECK_AND_FAST_DIVIDE(m_sqr_mu, m_sqr_zdiff, s2, fast_divide_and_NR_steps); + + const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, m_sqr_mu, m_mask_left)); + if(need_savg) { union_mDperp.m_Dperp = AVX_SQRT_FLOAT(s2); } @@ -331,13 +344,17 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE #include "sse_calls.h" static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, - const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, + const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, DOUBLE *src_savg, uint64_t *src_npairs, DOUBLE *src_weightavg, const weight_method_t weight_method) { + (void) fast_divide_and_NR_steps; + if(N0 == 0 || N1 == 0) { return EXIT_SUCCESS; } @@ -633,6 +650,7 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, const int same_cell, + const unsigned int fast_divide_and_NR_steps, const DOUBLE sqr_smax, const DOUBLE sqr_smin, const int nsbin, const int nmu_bins, const DOUBLE *supp_sqr, const DOUBLE mu_max, const DOUBLE pimax, const DOUBLE off_xwrap, const DOUBLE off_ywrap, const DOUBLE off_zwrap, @@ -640,6 +658,7 @@ static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *src_weightavg, const weight_method_t weight_method) { + (void) fast_divide_and_NR_steps; if(N0 == 0 || N1 == 0) { return EXIT_SUCCESS; } diff --git a/theory/python_bindings/_countpairs.c b/theory/python_bindings/_countpairs.c index 38debd15..22de4a66 100644 --- a/theory/python_bindings/_countpairs.c +++ b/theory/python_bindings/_countpairs.c @@ -607,8 +607,9 @@ static PyMethodDef module_methods[] = { {"countpairs_s_mu" ,(PyCFunction) countpairs_countpairs_s_mu ,METH_VARARGS | METH_KEYWORDS, "countpairs_s_mu(autocorr, nthreads, binfile, mu_max, nmu_bins, X1, Y1, Z1, weights1=None, weight_type=None,\n" " periodic=True, X2=None, Y2=None, Z2=None, weights2=None, verbose=False,\n" - " boxsize=0.0, output_savg=False, xbin_refine_factor=2, ybin_refine_factor=2,\n" - " zbin_refine_factor=1, max_cells_per_dim=100, c_api_timer=False, isa=-1)\n" + " boxsize=0.0, output_savg=False, fast_divide_and_NR_steps=0,\n" + " xbin_refine_factor=2, ybin_refine_factor=2, zbin_refine_factor=1,\n" + " max_cells_per_dim=100, c_api_timer=False, isa=-1)\n" "\n" "Calculate the 2-D pair-counts corresponding to the real-space correlation\n" "function, "XI_CHAR"(s, "MU_CHAR"). 
Pairs which are separated\n" @@ -691,6 +692,13 @@ static PyMethodDef module_methods[] = { " values, then pass in double precision arrays for the particle positions.\n" "\n" + "fast_divide_and_NR_steps: integer (default 0)\n" + " Replaces the division in the ``AVX`` implementation with an\n" + " approximate reciprocal, followed by ``fast_divide_and_NR_steps``\n" + " Newton-Raphson steps. Can improve\n" + " runtime by ~15-20%. A value of 0 keeps the standard division.\n" + "\n" + "(xyz)bin_refine_factor: integer (default (2,2,1) typical values in [1-3]) \n" " Controls the refinement on the cell sizes. Can have up to a 20% impact \n" " on runtime. \n\n" @@ -2167,6 +2175,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb options.instruction_set = -1; options.periodic = 1; options.c_api_timer = 0; + options.fast_divide_and_NR_steps = 0; int8_t xbin_ref=options.bin_refine_factors[0], ybin_ref=options.bin_refine_factors[1], zbin_ref=options.bin_refine_factors[2]; @@ -2189,6 +2198,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb "verbose", /* keyword verbose -> print extra info at runtime + progressbar */ "boxsize", "output_savg", + "fast_divide_and_NR_steps", "xbin_refine_factor", "ybin_refine_factor", "zbin_refine_factor", @@ -2199,7 +2209,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb NULL }; - if ( ! PyArg_ParseTupleAndKeywords(args, kwargs, "iisdiO!O!O!|O!O!O!O!O!bbdbbbbhbis", kwlist, + if ( ! PyArg_ParseTupleAndKeywords(args, kwargs, "iisdiO!O!O!|O!O!O!O!O!bbdbbbbbhbis", kwlist, &autocorr,&nthreads,&binfile, &mu_max, &nmu_bins, &PyArray_Type,&x1_obj, &PyArray_Type,&y1_obj, @@ -2213,6 +2223,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb &(options.verbose), &(options.boxsize), &(options.need_avg_sep), + &(options.fast_divide_and_NR_steps), &xbin_ref, &ybin_ref, &zbin_ref, &(options.max_cells_per_dim), &(options.c_api_timer), @@ -2224,7 +2235,7 @@ static PyObject *countpairs_countpairs_s_mu(PyObject *self, PyObject *args, PyOb fprintf(stdout, "\n"); char msg[1024]; - int len=snprintf(msg, 1024,"ArgumentError: In DDsmu> Could not parse the arguments. Input parameters are: \n"); + int len=snprintf(msg, 1024,"ArgumentError: In %s> Could not parse the arguments. Input parameters are: \n", __FUNCTION__); /* How many keywords do we have?
Subtract 1 because of the last NULL */ const size_t nitems = sizeof(kwlist)/sizeof(*kwlist) - 1; diff --git a/theory/tests/Makefile b/theory/tests/Makefile index 41e7d2d1..5700748d 100644 --- a/theory/tests/Makefile +++ b/theory/tests/Makefile @@ -23,10 +23,11 @@ VPF_LIB := countspheres include $(ROOT_DIR)/theory.options $(ROOT_DIR)/common.mk TARGETS := test_periodic test_nonperiodic + ifneq ($(COMPILE_PYTHON_EXT), 0) -TARGETS += python_lib + TARGETS += python_lib else -$(warning $(ccmagenta) Skipping python tests since python or numpy is unavailable $(ccreset)) + $(warning $(ccmagenta) Skipping python tests since python or numpy is unavailable $(ccreset)) endif SRC1 := test_periodic.c $(IO_DIR)/io.c $(IO_DIR)/ftread.c $(UTILS_DIR)/utils.c diff --git a/theory/tests/test_nonperiodic.c b/theory/tests/test_nonperiodic.c index 1a6300b4..33fc47e0 100644 --- a/theory/tests/test_nonperiodic.c +++ b/theory/tests/test_nonperiodic.c @@ -335,8 +335,9 @@ int main(int argc, char **argv) options.need_avg_sep=1; options.verbose=0; options.periodic=0; + options.fast_divide_and_NR_steps=0; options.float_type=sizeof(double); - + gettimeofday(&tstart,NULL); //set the globals diff --git a/theory/tests/test_periodic.c b/theory/tests/test_periodic.c index 9a0e6615..0ec8b77b 100644 --- a/theory/tests/test_periodic.c +++ b/theory/tests/test_periodic.c @@ -576,8 +576,8 @@ int main(int argc, char **argv) options.need_avg_sep=1; options.verbose=0; options.periodic=1; + options.fast_divide_and_NR_steps=0; options.float_type=sizeof(double); - //options.instruction_set = FALLBACK; char file[]="../tests/data/gals_Mr19.ff"; char fileformat[]="f"; diff --git a/utils/Makefile b/utils/Makefile index 5d9766fb..599acce9 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -11,8 +11,8 @@ INCL := avx_calls.h sse_calls.h defs.h defs.h function_precision.h cosmology_pa gridlink_impl_double.h gridlink_impl_float.h gridlink_impl.c.src gridlink_impl.h.src \ gridlink_mocks_impl_float.h gridlink_mocks_impl_double.h gridlink_mocks_impl.h.src gridlink_mocks_impl.c.src \ progressbar.h set_cosmo_dist.h set_cosmology.h sglib.h utils.h \ - weight_functions_double.h weight_functions_float.h weight_functions.h.src \ - weight_defs_double.h weight_defs_float.h weight_defs.h.src + weight_functions_double.h weight_functions_float.h weight_functions.h.src \ + weight_defs_double.h weight_defs_float.h weight_defs.h.src all: $(TARGETOBJS) Makefile $(ROOT_DIR)/common.mk $(ROOT_DIR)/theory.options $(ROOT_DIR)/mocks.options diff --git a/utils/avx_calls.h b/utils/avx_calls.h index 52a545aa..37db6a8b 100644 --- a/utils/avx_calls.h +++ b/utils/avx_calls.h @@ -39,7 +39,9 @@ extern "C" { #define AVX_NVEC 8 #define AVX_INTS __m256i #define AVX_FLOATS __m256 - + +#define AVX_SETZERO_FLOAT() _mm256_setzero_ps() + #define AVX_LOAD_FLOATS_UNALIGNED(X) _mm256_loadu_ps(X) #define AVX_LOAD_FLOATS_ALIGNED(X) _mm256_load_ps(X) #define AVX_MULTIPLY_FLOATS(X,Y) _mm256_mul_ps(X,Y) @@ -103,6 +105,8 @@ extern "C" { #define AVX_INTS __m128i #define AVX_FLOATS __m256d +#define AVX_SETZERO_FLOAT() _mm256_setzero_pd() + #define AVX_LOAD_FLOATS_UNALIGNED(X) _mm256_loadu_pd(X) #define AVX_LOAD_FLOATS_ALIGNED(X) _mm256_load_pd(X) #define AVX_MULTIPLY_FLOATS(X,Y) _mm256_mul_pd(X,Y) @@ -198,6 +202,65 @@ static inline AVX_FLOATS inv_cosine_avx(const AVX_FLOATS X, const int order) #endif + +#ifdef DOUBLE_PREC +#define CHECK_AND_FAST_DIVIDE(result, numerator, denominator, fast_divide_and_NR_steps) { \ + /* For double precision floats */ \ + if (fast_divide_and_NR_steps == 0) { \ + 
result = AVX_DIVIDE_FLOATS(numerator, denominator); \
+            /* The divide is the actual operation we need */ \
+            /* but divides are about 10x slower than multiplies. So, I am replacing it */ \
+            /* with an approximate reciprocal in floating point */ \
+            /* + `fast_divide_and_NR_steps` iterations of Newton-Raphson in case of DOUBLE */ \
+        } else { \
+            unsigned int _ii; \
+            /* the following block does an approximate reciprocal followed by `fast_divide_and_NR_steps` iterations of Newton-Raphson */ \
+            const __m128 float_tmp1 = _mm256_cvtpd_ps(denominator);/* convert double to float -> not avx_floats := _m256d */ \
+            /*(convert 4 doubles into 4 floats -> use half of available 256 bit SIMD registers) */ \
+            __m128 float_inv_tmp1 = _mm_rcp_ps(float_tmp1);/* intrinsic for 128 bit float approximate reciprocal */ \
+            const AVX_FLOATS rc = _mm256_cvtps_pd(float_inv_tmp1);/* convert back to double */ \
+            /* We have the double->float->approx. reciprocal->double process done. */ \
+            /* Now improve the accuracy of the divide with Newton-Raphson. */ \
+            const AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0); \
+            AVX_FLOATS rc_iter = rc; \
+            /* Do NewtonRaphson iterations: rc_iter <- rc_iter*(2 - denominator*rc_iter) */ \
+            for(_ii=0;_ii<fast_divide_and_NR_steps;_ii++) { \
+                rc_iter = AVX_MULTIPLY_FLOATS(rc_iter, AVX_SUBTRACT_FLOATS(two, AVX_MULTIPLY_FLOATS(denominator, rc_iter))); \
+            } \
+            result = AVX_MULTIPLY_FLOATS(numerator, rc_iter); \
+        } /* end of fast-divide for DOUBLE */ \
+    }
+#else
+#define CHECK_AND_FAST_DIVIDE(result, numerator, denominator, fast_divide_and_NR_steps) { \
+        /* For single precision floats */ \
+        if (fast_divide_and_NR_steps == 0) { \
+            result = AVX_DIVIDE_FLOATS(numerator, denominator); \
+        } else { \
+            unsigned int _ii; \
+            /* approximate reciprocal followed by `fast_divide_and_NR_steps` iterations of Newton-Raphson */ \
+            const AVX_FLOATS rc = AVX_RECIPROCAL_FLOATS(denominator);/* intrinsic for the approximate reciprocal */ \
+            /* We have the approx. reciprocal done. */ \
+            /* Now improve the accuracy of the divide with Newton-Raphson. */ \
+            const AVX_FLOATS two = AVX_SET_FLOAT((DOUBLE) 2.0); \
+            AVX_FLOATS rc_iter = rc; \
+            /* Do NewtonRaphson iterations */ \
+            for(_ii=0;_ii<fast_divide_and_NR_steps;_ii++) { \
+                rc_iter = AVX_MULTIPLY_FLOATS(rc_iter, AVX_SUBTRACT_FLOATS(two, AVX_MULTIPLY_FLOATS(denominator, rc_iter))); \
+            } \
+            result = AVX_MULTIPLY_FLOATS(numerator, rc_iter); \
+        } /* end of fast-divide for FLOAT */ \
+    }
+#endif
diff --git a/utils/defs.h b/utils/defs.h
@@ ... @@
-    uint8_t fast_divide;
+    /* Fast divide for the AVX kernels of DD(rp,pi) and DD(s,mu) */
+    uint8_t fast_divide_and_NR_steps; /* If set to 0, the standard divide is used;
+                                         if fast_divide_and_NR_steps > 0, the value is interpreted as the number of NR steps
+                                         i.e., fast_divide_and_NR_steps = 2, performs two steps of Newton-Raphson
+                                         Anything greater than ~5, probably makes the code slower than the
+                                         divide without any improvement in precision
+                                       */
+
     /* Fast arccos for wtheta (effective only when OUTPUT_THETAAVG is enabled) */
     uint8_t fast_acos;
@@ -269,13 +277,11 @@ static inline struct config_options get_config_options(void)
 #endif
     /* Options specific to mocks */
-    /* Options for DDrppi_mocks */
-#ifdef FAST_DIVIDE
-    options.fast_divide=1;
+#if defined(FAST_DIVIDE)
+    options.fast_divide_and_NR_steps=FAST_DIVIDE;
 #endif
-    /* Options for wtheta*/
 #ifdef OUTPUT_THETAAVG
     options.need_avg_sep = 1;
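A note on the new option, for reviewers: the sketch below is a minimal scalar analogue (not part of the patch) of what CHECK_AND_FAST_DIVIDE does on each SIMD lane. The helper name and the 12-bit truncation standing in for the `_mm_rcp_ps` seed are made up for illustration. Each Newton-Raphson step computes r <- r*(2 - d*r) and roughly doubles the number of correct bits, which is why a handful of steps recovers full precision and why values beyond ~5 only add cost.

#include <stdio.h>

/* Scalar illustration of the fast-divide path: start from a crude reciprocal
 * estimate of the denominator and refine it with Newton-Raphson. */
static double fast_divide_scalar(double num, double den, unsigned int nr_steps)
{
    if (nr_steps == 0) {
        return num / den;                    /* standard divide, as in the macro */
    }
    /* crude ~12-bit reciprocal seed; a stand-in for _mm_rcp_ps (demo only,
       assumes 0 < 1/den and that (1/den)*4096 fits in an int) */
    const float r = 1.0f / (float) den;
    double rc = (double) ((int) (r * 4096.0f)) / 4096.0;
    for (unsigned int i = 0; i < nr_steps; i++) {
        rc = rc * (2.0 - den * rc);          /* Newton-Raphson update */
    }
    return num * rc;                         /* numerator * (refined 1/denominator) */
}

int main(void)
{
    const double num = 3.0, den = 7.0;
    for (unsigned int steps = 0; steps <= 3; steps++) {
        printf("NR steps = %u : %.17g (exact %.17g)\n",
               steps, fast_divide_scalar(num, den, steps), num / den);
    }
    return 0;
}

Because the macro falls back to the exact divide when the option is 0, default results are unchanged; the option only trades a little precision for speed when explicitly requested.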
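For readers new to DD(s, mu), here is a scalar sketch of the quantity the kernels above bin, assuming the z-axis as the line of sight (as in the theory pair counter). The helper name `pair_s_mu_bin` and the linear s-binning are illustrative only; the real s-bin edges come from the user-supplied binfile.

#include <math.h>

/* Illustrative only: for one pair separated by (dx, dy, dz), compute
 *   s  = |r1 - r2|   (3-D pair separation)
 *   mu = |dz| / s    (cosine of the angle between the pair and the z-axis)
 * The AVX kernel computes mu^2 = dz^2/s^2 via CHECK_AND_FAST_DIVIDE and then
 * takes a square root; here plain arithmetic is used instead. */
static int pair_s_mu_bin(const double dx, const double dy, const double dz,
                         const double smin, const double smax, const int nsbin,
                         const double mu_max, const int nmu_bins,
                         int *is, int *imu)
{
    const double sqr_s = dx*dx + dy*dy + dz*dz;
    if (sqr_s <= 0.0 || sqr_s < smin*smin || sqr_s >= smax*smax) {
        return 0;                             /* pair does not contribute */
    }
    const double s  = sqrt(sqr_s);
    const double mu = fabs(dz) / s;           /* equivalent to sqrt(dz^2/sqr_s) */
    if (mu >= mu_max) {
        return 0;
    }
    *is  = (int) ((s - smin) * nsbin / (smax - smin));  /* linear s bins, demo only */
    *imu = (int) (mu * nmu_bins / mu_max);              /* nmu_bins bins in [0, mu_max) */
    return 1;
}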