From 6248c55f1a74a78835e298d1f4da19ba26df5bc8 Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Thu, 17 Mar 2022 16:54:43 +0100 Subject: [PATCH] Adjusted Clang format (#585) Other: allow to experiment with untuned/default parameters * Accept to specify a generic GPU (CUDA/HIP). --- .clang-format | 7 +- src/acc/cuda/Makefile | 82 ++++++++++++----------- src/acc/libsmm_acc/generate_parameters.py | 39 ++++++----- src/acc/opencl/Makefile | 63 ++++++++--------- src/acc/opencl/acc_opencl.c | 29 +++++--- src/acc/opencl/acc_opencl_event.c | 7 +- src/acc/opencl/acc_opencl_stream.c | 7 +- src/acc/opencl/smm/opencl_libsmm.c | 33 ++++++--- 8 files changed, 151 insertions(+), 116 deletions(-) diff --git a/.clang-format b/.clang-format index f75d1b7958c..9ef69e9efc8 100644 --- a/.clang-format +++ b/.clang-format @@ -3,11 +3,12 @@ AlignAfterOpenBracket: DontAlign AlignEscapedNewlines: DontAlign AlignTrailingComments: false AllowShortCaseLabelsOnASingleLine: true -AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortIfStatementsOnASingleLine: AllIfsAndElse AllowShortLoopsOnASingleLine: true BraceWrapping: - BeforeCatch: true - BeforeElse: true + AfterControlStatement: MultiLine + BeforeCatch: true + BeforeElse: true BreakBeforeBraces: Custom ColumnLimit: 132 ConstructorInitializerIndentWidth: 0 diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile index b5f9e43592a..83802f30976 100644 --- a/src/acc/cuda/Makefile +++ b/src/acc/cuda/Makefile @@ -2,25 +2,27 @@ # It is for testing and comparison with other implementations. MAKDIR := $(subst //,,$(dir $(firstword $(MAKEFILE_LIST)))/) -INCACC := $(wildcard $(MAKDIR)/*.h*) $(MAKDIR)/../acc.h -SRCACC := $(wildcard $(MAKDIR)/../cuda_hip/*.cpp) \ +ACCDIR := $(MAKDIR)/.. +DIRSMM := $(ACCDIR)/libsmm_acc +INCACC := $(wildcard $(MAKDIR)/*.h*) $(ACCDIR)/acc.h +SRCACC := $(wildcard $(ACCDIR)/cuda_hip/*.cpp) \ $(wildcard $(MAKDIR)/*.cpp) \ $(NULL) OBJACC := $(SRCACC:.cpp=.o) -GPUSMM := $(wildcard $(MAKDIR)/../libsmm_acc/kernels/*.h*) -INCSMM := $(wildcard $(MAKDIR)/../libsmm_acc/*.h*) \ - $(MAKDIR)/../libsmm_acc/smm_acc_kernels.h \ - $(MAKDIR)/../libsmm_acc/parameters.h \ - $(MAKDIR)/../acc_libsmm.h \ - $(MAKDIR)/../acc_bench.h \ +GPUSMM := $(wildcard $(DIRSMM)/kernels/*.h*) +INCSMM := $(wildcard $(DIRSMM)/*.h*) \ + $(DIRSMM)/parameters.h \ + $(DIRSMM)/smm_acc_kernels.h \ + $(ACCDIR)/acc_libsmm.h \ + $(ACCDIR)/acc_bench.h \ $(NULL) -SRCSMM := $(wildcard $(MAKDIR)/../libsmm_acc/*.cpp) +SRCSMM := $(wildcard $(DIRSMM)/*.cpp) OBJSMM := $(SRCSMM:.cpp=.o) INCALL := $(INCACC) $(INCSMM) -LIBXSMMROOT := $(wildcard $(MAKDIR)/../../../../libxsmm) +LIBXSMMROOT := $(wildcard $(ACCDIR)/../../../libxsmm) ifeq (,$(LIBXSMMROOT)) LIBXSMMROOT := $(wildcard $(HOME)/libxsmm) endif @@ -63,7 +65,7 @@ else ifeq ($(WITH_GPU),P100) else ifeq ($(WITH_GPU),V100) ARCH_NUMBER = 70 else ifeq ($(WITH_GPU),A100) - # TODO: update when tuned parameters for A100 available + # TODO: update for A100 tuned parameters override WITH_GPU := V100 ARCH_NUMBER = 80 else ifeq (,$(ARCH_NUMBER)) @@ -167,22 +169,22 @@ LDFLAGS += -lcudart -lcublas -lnvrtc -lcuda CXXFLAGS += -std=c++11 $(CFLAGS) .PHONY: bench -bench: $(MAKDIR)/../acc_bench_smm $(MAKDIR)/../acc_bench_trans +bench: $(ACCDIR)/acc_bench_smm $(ACCDIR)/acc_bench_trans .PHONY: all -all: bench $(MAKDIR)/../dbcsr_acc_test +all: bench $(ACCDIR)/dbcsr_acc_test .PHONY: test test: test-interface test-trans test-smm .PHONY: test-interface -test-interface: $(MAKDIR)/../dbcsr_acc_test +test-interface: $(ACCDIR)/dbcsr_acc_test @echo "--- DBCSR Backend Interface" - $(MAKDIR)/../dbcsr_acc_test + $(ACCDIR)/dbcsr_acc_test .PHONY: test-trans test-trans: bench - $(eval SHAPES = $(shell $(MAKDIR)/../acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NTRANS) -a)) + $(eval SHAPES = $(shell $(ACCDIR)/acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NTRANS) -a)) @echo "--- DBCSR CUDA Transposes ($(words $(SHAPES)))" @echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ifneq (,$(LD_PRELOAD)) @@ -191,16 +193,16 @@ endif @echo "CXX: $$($(CXX) --version | head -n1)" @echo "CC: $$($(CC) --version | head -n1)" @echo "runtime libraries:" - @ldd $(MAKDIR)/../acc_bench_trans + @ldd $(ACCDIR)/acc_bench_trans @echo "hostname: $$(hostname)" @echo @for SHAPE in $(SHAPES); do \ - $(MAKDIR)/../acc_bench_trans $${SHAPE} || exit 1; \ + $(ACCDIR)/acc_bench_trans $${SHAPE} || exit 1; \ echo; \ done $(MAKDIR)/test-smm.log: bench - $(eval SHAPES = $(shell $(MAKDIR)/../acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NSMMS))) + $(eval SHAPES = $(shell $(ACCDIR)/acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NSMMS))) @echo "--- DBCSR CUDA SMMs ($(words $(SHAPES)))" @echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ifneq (,$(LD_PRELOAD)) @@ -209,11 +211,11 @@ endif @echo "CXX: $$($(CXX) --version | head -n1)" @echo "CC: $$($(CC) --version | head -n1)" @echo "runtime libraries:" - @ldd $(MAKDIR)/../acc_bench_smm + @ldd $(ACCDIR)/acc_bench_smm @echo "hostname: $$(hostname)" @echo @echo "$(SHAPES)" | xargs -n1 | \ - (CHECK=$(if $(CHECK),$(CHECK),1) stdbuf --output=L $(MAKDIR)/../acc_bench_smm /dev/stdin \ + (CHECK=$(if $(CHECK),$(CHECK),1) stdbuf --output=L $(ACCDIR)/acc_bench_smm /dev/stdin \ 2>$(MAKDIR)/test-smm.err && rm $(MAKDIR)/test-smm.err) | tee $@ @if [ -s $(MAKDIR)/test-smm.err ]; then cat $(MAKDIR)/test-smm.err && exit 1; fi @@ -227,42 +229,44 @@ endif @echo "mean: $$(sed -n "/device:/p" $< | datamash -W -R 1 mean 4) GFLOPS/s" endif -$(MAKDIR)/../libsmm_acc/parameters.h: $(MAKDIR)/Makefile $(MAKDIR)/../libsmm_acc/generate_parameters.py $(MAKDIR)/../libsmm_acc/parameters/parameters_$(WITH_GPU).json - @cd $(MAKDIR)/../libsmm_acc && $(PYTHON) ../libsmm_acc/generate_parameters.py --gpu_version=$(WITH_GPU) --base_dir=../libsmm_acc/parameters +PARDIR := $(DIRSMM)/parameters +PARAMS := $(wildcard $(PARDIR)/parameters_$(WITH_GPU).json) +$(DIRSMM)/parameters.h: $(MAKDIR)/Makefile $(DIRSMM)/generate_parameters.py $(PARAMS) + @cd $(DIRSMM) && $(PYTHON) ../libsmm_acc/generate_parameters.py --gpu_version=$(WITH_GPU) --base_dir=../libsmm_acc/parameters -$(MAKDIR)/../libsmm_acc/smm_acc_kernels.h: $(GPUSMM) $(MAKDIR)/Makefile $(MAKDIR)/../libsmm_acc/generate_kernels.py $(MAKDIR)/../libsmm_acc/parameters/parameters_$(WITH_GPU).json - @cd $(MAKDIR)/../libsmm_acc && $(PYTHON) ../libsmm_acc/generate_kernels.py ../libsmm_acc/kernels +$(DIRSMM)/smm_acc_kernels.h: $(GPUSMM) $(MAKDIR)/Makefile $(DIRSMM)/generate_kernels.py $(PARAMS) + @cd $(DIRSMM) && $(PYTHON) ../libsmm_acc/generate_kernels.py ../libsmm_acc/kernels -$(MAKDIR)/../dbcsr_acc.a: $(OBJACC) $(MAKDIR)/../libsmm_acc/libsmm_acc_init.o +$(ACCDIR)/dbcsr_acc.a: $(OBJACC) $(DIRSMM)/libsmm_acc_init.o $(AR) -rs $@ $^ -$(MAKDIR)/../dbcsr_acc_smm.a: $(OBJSMM) +$(ACCDIR)/dbcsr_acc_smm.a: $(OBJSMM) $(AR) -rs $@ $^ %.o: %.cpp $(INCALL) $(MAKDIR)/Makefile $(CXX) $(CXXFLAGS) $(CFLAGS_XSMM) -c $< -o $@ -$(MAKDIR)/acc_bench_smm.o: $(MAKDIR)/../acc_bench_smm.c $(MAKDIR)/Makefile +$(MAKDIR)/acc_bench_smm.o: $(ACCDIR)/acc_bench_smm.c $(MAKDIR)/Makefile ifneq (0,$(LIBXSMM)) $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ else $(CC) $(CFLAGS) -c $< -o $@ endif -$(MAKDIR)/../acc_bench_smm: $(MAKDIR)/acc_bench_smm.o $(MAKDIR)/../dbcsr_acc.a $(MAKDIR)/../dbcsr_acc_smm.a +$(ACCDIR)/acc_bench_smm: $(MAKDIR)/acc_bench_smm.o $(ACCDIR)/dbcsr_acc.a $(ACCDIR)/dbcsr_acc_smm.a $(CXX) $^ $(LDFLAGS) -o $@ -$(MAKDIR)/acc_bench_trans.o: $(MAKDIR)/../acc_bench_trans.c $(MAKDIR)/Makefile +$(MAKDIR)/acc_bench_trans.o: $(ACCDIR)/acc_bench_trans.c $(MAKDIR)/Makefile ifneq (0,$(LIBXSMM)) $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ else $(CC) $(CFLAGS) -c $< -o $@ endif -$(MAKDIR)/../acc_bench_trans: $(MAKDIR)/acc_bench_trans.o $(MAKDIR)/../dbcsr_acc.a $(MAKDIR)/../dbcsr_acc_smm.a +$(ACCDIR)/acc_bench_trans: $(MAKDIR)/acc_bench_trans.o $(ACCDIR)/dbcsr_acc.a $(ACCDIR)/dbcsr_acc_smm.a $(CXX) $^ $(LDFLAGS) -o $@ -$(MAKDIR)/dbcsr_acc_test.o: $(MAKDIR)/../../../tests/dbcsr_acc_test.c $(MAKDIR)/Makefile - $(CC) $(CFLAGS) -I$(MAKDIR)/../.. -c $< -o $@ -$(MAKDIR)/../dbcsr_acc_test: $(MAKDIR)/dbcsr_acc_test.o $(MAKDIR)/../dbcsr_acc.a $(MAKDIR)/../dbcsr_acc_smm.a +$(MAKDIR)/dbcsr_acc_test.o: $(ACCDIR)/../../tests/dbcsr_acc_test.c $(MAKDIR)/Makefile + $(CC) $(CFLAGS) -I$(ACCDIR)/.. -c $< -o $@ +$(ACCDIR)/dbcsr_acc_test: $(MAKDIR)/dbcsr_acc_test.o $(ACCDIR)/dbcsr_acc.a $(ACCDIR)/dbcsr_acc_smm.a $(CXX) $^ $(LDFLAGS) -o $@ .PHONY: clean @@ -271,13 +275,13 @@ clean: @rm -f $(MAKDIR)/dbcsr_acc_test.o @rm -f $(MAKDIR)/acc_bench_trans.o @rm -f $(MAKDIR)/acc_bench_smm.o - @rm -f $(MAKDIR)/../libsmm_acc/parameters.h - @rm -f $(MAKDIR)/../libsmm_acc/smm_acc_kernels.h + @rm -f $(DIRSMM)/parameters.h + @rm -f $(DIRSMM)/smm_acc_kernels.h @rm -f $(MAKDIR)/test-smm.err .PHONY: realclean realclean: clean - @rm -f $(MAKDIR)/../dbcsr_acc.a $(MAKDIR)/../dbcsr_acc_smm.a - @rm -f $(MAKDIR)/../acc_bench_smm $(MAKDIR)/../acc_bench_trans - @rm -f $(MAKDIR)/../dbcsr_acc_test + @rm -f $(ACCDIR)/dbcsr_acc.a $(ACCDIR)/dbcsr_acc_smm.a + @rm -f $(ACCDIR)/acc_bench_smm $(ACCDIR)/acc_bench_trans + @rm -f $(ACCDIR)/dbcsr_acc_test @rm -f $(MAKDIR)/test-smm.log diff --git a/src/acc/libsmm_acc/generate_parameters.py b/src/acc/libsmm_acc/generate_parameters.py index b772d42b303..4b16996a5e9 100755 --- a/src/acc/libsmm_acc/generate_parameters.py +++ b/src/acc/libsmm_acc/generate_parameters.py @@ -20,20 +20,28 @@ # =============================================================================== def main(gpu_version, base_dir): - # Read existing parameters - print("GPU version: {}".format(gpu_version)) - param_fn = path.join(base_dir, "parameters_{}.json".format(gpu_version)) - with open(param_fn) as f: - all_kernels = [params_dict_to_kernel(**params) for params in json.load(f)] - print( - "About to process {:,} kernels from file {}".format(len(all_kernels), param_fn) - ) - - # Read GPU properties (warp size) - gpu_props_fn = path.join(base_dir, "../kernels/gpu_properties.json") - arch_code = gpu_architectures[path.basename(param_fn)] - with open(gpu_props_fn) as f: - gpu_warp_size = json.load(f)[arch_code]["Threads_/_Warp"] + try: # Read existing parameters + param_fn = path.join(base_dir, "parameters_{}.json".format(gpu_version)) + with open(param_fn) as f: + print("GPU version: {}".format(gpu_version)) + all_kernels = [params_dict_to_kernel(**params) for params in json.load(f)] + print( + "About to process {:,} kernels from file {}".format( + len(all_kernels), param_fn + ) + ) + except: # noqa: E722 + all_kernels = [] + pass + + try: # Read GPU properties (warp size) + gpu_props_fn = path.join(base_dir, "../kernels/gpu_properties.json") + arch_code = gpu_architectures[path.basename(param_fn)] + with open(gpu_props_fn) as f: + gpu_warp_size = json.load(f)[arch_code]["Threads_/_Warp"] + except: # noqa: E722 + gpu_warp_size = 32 + pass print("GPU warp size: {}".format(gpu_warp_size)) # Construct output @@ -41,7 +49,8 @@ def main(gpu_version, base_dir): # Write to c++ header-file file_h = "parameters.h" - print("Found {:,} kernels in file {}".format(len(all_kernels), param_fn)) + if all_kernels: + print("Found {:,} kernels in file {}".format(len(all_kernels), param_fn)) print("Printing them to file {}".format(file_h)) with open(file_h, "w") as f: f.write(out) diff --git a/src/acc/opencl/Makefile b/src/acc/opencl/Makefile index a009626b408..342af943599 100644 --- a/src/acc/opencl/Makefile +++ b/src/acc/opencl/Makefile @@ -1,12 +1,13 @@ MAKDIR := $(subst //,,$(dir $(firstword $(MAKEFILE_LIST)))/) -INCACC := $(wildcard $(MAKDIR)/*.h*) $(MAKDIR)/../acc.h +ACCDIR := $(MAKDIR)/.. +INCACC := $(wildcard $(MAKDIR)/*.h*) $(ACCDIR)/acc.h SRCACC := $(wildcard $(MAKDIR)/*.c) OBJACC := $(SRCACC:.c=.o) INCSMM := $(wildcard $(MAKDIR)/smm/*.h*) \ $(MAKDIR)/smm/opencl_kernels.h \ - $(MAKDIR)/../acc_libsmm.h \ - $(MAKDIR)/../acc_bench.h \ + $(ACCDIR)/acc_libsmm.h \ + $(ACCDIR)/acc_bench.h \ $(NULL) SRCSMM := $(wildcard $(MAKDIR)/smm/*.c) OBJSMM := $(SRCSMM:.c=.o) @@ -14,7 +15,7 @@ KERNEL := $(wildcard $(MAKDIR)/smm/kernels/*.cl) INCALL := $(INCACC) $(INCSMM) -LIBXSMMROOT := $(wildcard $(MAKDIR)/../../../../libxsmm) +LIBXSMMROOT := $(wildcard $(ACCDIR)/../../../libxsmm) ifeq (,$(LIBXSMMROOT)) LIBXSMMROOT := $(wildcard $(HOME)/libxsmm) endif @@ -176,23 +177,23 @@ ifneq (,$(wildcard $(CXXLIBDIR))) endif .PHONY: bench -bench: $(MAKDIR)/../acc_bench_smm $(MAKDIR)/../acc_bench_trans +bench: $(ACCDIR)/acc_bench_smm $(ACCDIR)/acc_bench_trans .PHONY: all -all: bench $(MAKDIR)/../dbcsr_acc_test +all: bench $(ACCDIR)/dbcsr_acc_test .PHONY: test test: test-interface test-trans test-smm .PHONY: test-interface -test-interface: $(MAKDIR)/../dbcsr_acc_test +test-interface: $(ACCDIR)/dbcsr_acc_test @echo "--- DBCSR Backend Interface" - $(MAKDIR)/../dbcsr_acc_test + $(ACCDIR)/dbcsr_acc_test .PHONY: test-trans test-trans: bench - $(eval SHAPES = $(shell $(MAKDIR)/../acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NTRANS) -a)) - $(eval DEVICE = $(shell ACC_OPENCL_VERBOSE=1 CHECK=0 $(MAKDIR)/../acc_bench_smm 1 1 1 2>&1 >/dev/null)) + $(eval SHAPES = $(shell $(ACCDIR)/acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NTRANS) -a)) + $(eval DEVICE = $(shell ACC_OPENCL_VERBOSE=1 CHECK=0 $(ACCDIR)/acc_bench_smm 1 1 1 2>&1 >/dev/null)) @echo "--- DBCSR OpenCL Transposes ($(words $(SHAPES)))" @echo "$(DEVICE)" @echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" @@ -201,17 +202,17 @@ ifneq (,$(LD_PRELOAD)) endif @echo "CC: $$($(CC) --version | head -n1)" @echo "runtime libraries:" - @ldd $(MAKDIR)/../acc_bench_trans + @ldd $(ACCDIR)/acc_bench_trans @echo "hostname: $$(hostname)" @echo @for SHAPE in $(SHAPES); do \ - $(MAKDIR)/../acc_bench_trans $${SHAPE} || exit 1; \ + $(ACCDIR)/acc_bench_trans $${SHAPE} || exit 1; \ echo; \ done $(MAKDIR)/test-smm.log: bench - $(eval SHAPES = $(shell $(MAKDIR)/../acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NSMMS))) - $(eval DEVICE = "$(shell LIBXSMM_VERBOSE=0 ACC_OPENCL_VERBOSE=1 CHECK=0 $(MAKDIR)/../acc_bench_smm 1 1 1 2>&1 >/dev/null)") + $(eval SHAPES = $(shell $(ACCDIR)/acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NSMMS))) + $(eval DEVICE = "$(shell LIBXSMM_VERBOSE=0 ACC_OPENCL_VERBOSE=1 CHECK=0 $(ACCDIR)/acc_bench_smm 1 1 1 2>&1 >/dev/null)") $(eval WITH_GPU = $(firstword $(foreach GPU,$(WITH_GPUS),$(findstring $(GPU),$(DEVICE))))) $(eval PARAMS = $(firstword $(wildcard $(PARAMS_DIR)/tune_multiply_*$(WITH_GPU).csv))) $(eval GPUENV = $(if $(PARAMS),$(if $(OPENCL_LIBSMM_SMM_PARAMS),$(NULL),OPENCL_LIBSMM_SMM_PARAMS=$(PARAMS)))) @@ -224,11 +225,11 @@ ifneq (,$(LD_PRELOAD)) endif @echo "CC: $$($(CC) --version | head -n1)" @echo "runtime libraries:" - @ldd $(MAKDIR)/../acc_bench_smm + @ldd $(ACCDIR)/acc_bench_smm @echo "hostname: $$(hostname)" @echo @echo "$(SHAPES)" | xargs -n1 | \ - ($(GPUENV) CHECK=$(if $(CHECK),$(CHECK),1) stdbuf --output=L $(MAKDIR)/../acc_bench_smm /dev/stdin \ + ($(GPUENV) CHECK=$(if $(CHECK),$(CHECK),1) stdbuf --output=L $(ACCDIR)/acc_bench_smm /dev/stdin \ 2>$(MAKDIR)/test-smm.err && rm $(MAKDIR)/test-smm.err) | tee $@ @if [ -s $(MAKDIR)/test-smm.err ]; then cat $(MAKDIR)/test-smm.err && exit 1; fi @@ -245,51 +246,51 @@ endif $(MAKDIR)/smm/opencl_kernels.h: $(MAKDIR)/acc_opencl.sh $(KERNEL) $(PARAMS) @CPPFLAGS=$(CPP_OPENCL_FLAGS) $(MAKDIR)/acc_opencl.sh $(KERNEL) $(PARAMS) $@ -$(MAKDIR)/../dbcsr_acc.a: $(OBJACC) +$(ACCDIR)/dbcsr_acc.a: $(OBJACC) $(AR) -rs $@ $^ -$(MAKDIR)/../dbcsr_acc_smm.a: $(OBJSMM) +$(ACCDIR)/dbcsr_acc_smm.a: $(OBJSMM) $(AR) -rs $@ $^ %.o: %.c $(INCALL) $(MAKDIR)/Makefile $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ -$(MAKDIR)/acc_bench_smm.o: $(MAKDIR)/../acc_bench_smm.c $(MAKDIR)/Makefile +$(MAKDIR)/acc_bench_smm.o: $(ACCDIR)/acc_bench_smm.c $(MAKDIR)/Makefile ifneq (0,$(LIBXSMM)) $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ else $(CC) $(CFLAGS) -c $< -o $@ endif -$(MAKDIR)/../acc_bench_smm: $(MAKDIR)/acc_bench_smm.o $(MAKDIR)/../dbcsr_acc_smm.a $(MAKDIR)/../dbcsr_acc.a +$(ACCDIR)/acc_bench_smm: $(MAKDIR)/acc_bench_smm.o $(ACCDIR)/dbcsr_acc_smm.a $(ACCDIR)/dbcsr_acc.a ifneq (,$(filter 0 1,$(DEV))) $(CC) $^ $(LDFLAGS) -o $@ else -.PHONY: $(MAKDIR)/../acc_bench_smm +.PHONY: $(ACCDIR)/acc_bench_smm endif -$(MAKDIR)/acc_bench_trans.o: $(MAKDIR)/../acc_bench_trans.c $(MAKDIR)/Makefile +$(MAKDIR)/acc_bench_trans.o: $(ACCDIR)/acc_bench_trans.c $(MAKDIR)/Makefile ifneq (0,$(LIBXSMM)) $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ else $(CC) $(CFLAGS) -c $< -o $@ endif -$(MAKDIR)/../acc_bench_trans: $(MAKDIR)/acc_bench_trans.o $(MAKDIR)/../dbcsr_acc_smm.a $(MAKDIR)/../dbcsr_acc.a +$(ACCDIR)/acc_bench_trans: $(MAKDIR)/acc_bench_trans.o $(ACCDIR)/dbcsr_acc_smm.a $(ACCDIR)/dbcsr_acc.a ifneq (,$(filter 0 1,$(DEV))) $(CC) $^ $(LDFLAGS) -o $@ else -.PHONY: $(MAKDIR)/../acc_bench_trans +.PHONY: $(ACCDIR)/acc_bench_trans endif -$(MAKDIR)/dbcsr_acc_test.o: $(MAKDIR)/../../../tests/dbcsr_acc_test.c $(MAKDIR)/Makefile - $(CC) $(CFLAGS) -I$(MAKDIR)/../.. -c $< -o $@ +$(MAKDIR)/dbcsr_acc_test.o: $(ACCDIR)/../../tests/dbcsr_acc_test.c $(MAKDIR)/Makefile + $(CC) $(CFLAGS) -I$(ACCDIR)/.. -c $< -o $@ -$(MAKDIR)/../dbcsr_acc_test: $(MAKDIR)/dbcsr_acc_test.o $(MAKDIR)/../dbcsr_acc.a +$(ACCDIR)/dbcsr_acc_test: $(MAKDIR)/dbcsr_acc_test.o $(ACCDIR)/dbcsr_acc.a ifneq (,$(filter 0 1,$(DEV))) $(CC) $^ $(LDFLAGS) -o $@ else -.PHONY: $(MAKDIR)/../dbcsr_acc_test +.PHONY: $(ACCDIR)/dbcsr_acc_test endif .PHONY: clean @@ -303,7 +304,7 @@ clean: .PHONY: realclean realclean: clean - @rm -f $(MAKDIR)/../dbcsr_acc.a $(MAKDIR)/../dbcsr_acc_smm.a - @rm -f $(MAKDIR)/../acc_bench_smm $(MAKDIR)/../acc_bench_trans - @rm -f $(MAKDIR)/../dbcsr_acc_test + @rm -f $(ACCDIR)/dbcsr_acc.a $(ACCDIR)/dbcsr_acc_smm.a + @rm -f $(ACCDIR)/acc_bench_smm $(ACCDIR)/acc_bench_trans + @rm -f $(ACCDIR)/dbcsr_acc_test @rm -f $(MAKDIR)/test-smm.log diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 50f60c0bf28..fb3be31d85b 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -99,8 +99,8 @@ cl_context c_dbcsr_acc_opencl_device_context(cl_device_id device, const int* thr result = c_dbcsr_acc_opencl_config.contexts[tid]; if (NULL != result) { cl_device_id device_id = NULL; - if (CL_SUCCESS == clGetContextInfo(result, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device_id, NULL) && - device == device_id) { + if (CL_SUCCESS == clGetContextInfo(result, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device_id, NULL) && device == device_id) + { break; } else { @@ -298,7 +298,8 @@ int c_dbcsr_acc_init(void) { n = ACC_OPENCL_DEVICES_MAXCOUNT - (cl_uint)c_dbcsr_acc_opencl_config.ndevices; } if (EXIT_SUCCESS == clCreateSubDevices(devices[j], properties, n, - c_dbcsr_acc_opencl_config.devices + c_dbcsr_acc_opencl_config.ndevices, NULL)) { + c_dbcsr_acc_opencl_config.devices + c_dbcsr_acc_opencl_config.ndevices, NULL)) + { ACC_OPENCL_CHECK(clReleaseDevice(devices[j]), "release device", result); c_dbcsr_acc_opencl_config.ndevices += n; } @@ -361,7 +362,8 @@ int c_dbcsr_acc_init(void) { else if (CL_DEVICE_TYPE_ALL == type && NULL == env_devtype && CL_DEVICE_TYPE_GPU == itype && device_id <= (int)i) { result = clGetDeviceInfo(c_dbcsr_acc_opencl_config.devices[i], CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL); if (CL_SUCCESS == result /* prune for homogeneous set of GPUs */ - && ('\0' == *tmp || 0 == strncmp(buffer, tmp, ACC_OPENCL_BUFFERSIZE))) { + && ('\0' == *tmp || 0 == strncmp(buffer, tmp, ACC_OPENCL_BUFFERSIZE))) + { c_dbcsr_acc_opencl_config.ndevices = i + 1; strncpy(tmp, buffer, ACC_OPENCL_BUFFERSIZE); } @@ -494,7 +496,8 @@ int c_dbcsr_acc_finalize(void) { int d; fprintf(stderr, "INFO ACC/OpenCL: pid=%u nthreads=%i", libxsmm_get_pid(), c_dbcsr_acc_opencl_config.nthreads); if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device(0, &device) && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_id(device, NULL /*devid*/, &d)) { + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_id(device, NULL /*devid*/, &d)) + { fprintf(stderr, " device=%i", d); } if (NULL != c_dbcsr_acc_opencl_config.stats) { @@ -836,7 +839,8 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id active_id) { int dev = 0; if (CL_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL) && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_id(active_id, NULL /*devid*/, &dev) && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_uid(active_id, &uid)) { + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_uid(active_id, &uid)) + { fprintf(stderr, "INFO ACC/OpenCL: ndevices=%i device%i=\"%s\" uid=0x%08X\n", c_dbcsr_acc_opencl_config.ndevices, dev, buffer, uid); } @@ -912,7 +916,8 @@ int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) { } # endif if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), - &c_dbcsr_acc_opencl_config.devinfo.unified, NULL)) { + &c_dbcsr_acc_opencl_config.devinfo.unified, NULL)) + { c_dbcsr_acc_opencl_config.devinfo.unified = CL_FALSE; } if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel")) { @@ -971,7 +976,8 @@ int c_dbcsr_acc_opencl_device_synchronize(int thread_id) { int result = EXIT_SUCCESS; if (0 == (4 & c_dbcsr_acc_opencl_config.flush) && (1 > c_dbcsr_acc_opencl_config.share || - 0 == (thread_id % (1 != c_dbcsr_acc_opencl_config.share ? c_dbcsr_acc_opencl_config.share : 2)))) { + 0 == (thread_id % (1 != c_dbcsr_acc_opencl_config.share ? c_dbcsr_acc_opencl_config.share : 2)))) + { void** const streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * thread_id; int i = 0; assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads); @@ -1094,13 +1100,14 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con for (; NULL != ext; ext = ((ext + 1) < end ? strtok((ext + 1) + strlen(ext), ACC_OPENCL_DELIMS " \t") : NULL)) { const char* line = source; for (;;) { - if (2 != - sscanf(line, "#pragma OPENCL EXTENSION %[^: ]%*[: ]%[^\n]", buffer, buffer + ACC_OPENCL_BUFFERSIZE / 2)) { + if (2 != sscanf(line, "#pragma OPENCL EXTENSION %[^: ]%*[: ]%[^\n]", buffer, buffer + ACC_OPENCL_BUFFERSIZE / 2)) + { line = NULL; break; } else if (0 == strncmp(buffer, ext, ACC_OPENCL_BUFFERSIZE / 2) && - 0 == strncmp(buffer + ACC_OPENCL_BUFFERSIZE / 2, "enable", ACC_OPENCL_BUFFERSIZE / 2)) { + 0 == strncmp(buffer + ACC_OPENCL_BUFFERSIZE / 2, "enable", ACC_OPENCL_BUFFERSIZE / 2)) + { break; } line = strchr(line, '\n'); diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c index 0d91e8d58cd..41e1769d8cd 100644 --- a/src/acc/opencl/acc_opencl_event.c +++ b/src/acc/opencl/acc_opencl_event.c @@ -35,8 +35,7 @@ extern "C" { int c_dbcsr_acc_opencl_event_create(cl_event* event_p) { int result; assert(NULL != event_p); - if (NULL != *event_p) - result = EXIT_SUCCESS; + if (NULL != *event_p) result = EXIT_SUCCESS; else { *event_p = clCreateUserEvent(c_dbcsr_acc_opencl_context(), &result); } @@ -162,7 +161,9 @@ int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* wait for an ev # else if (NULL != clevent) # endif - { result = ACC_OPENCL_WAIT_EVENT(*ACC_OPENCL_STREAM(stream), &clevent); } + { + result = ACC_OPENCL_WAIT_EVENT(*ACC_OPENCL_STREAM(stream), &clevent); + } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); # endif diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index fd0fbb12777..7c56f5c8a22 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -107,7 +107,8 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { else { int least = -1, greatest = -1; if (0 != (1 & c_dbcsr_acc_opencl_config.priority) && EXIT_SUCCESS == c_dbcsr_acc_stream_priority_range(&least, &greatest) && - least != greatest) { + least != greatest) + { properties[3] = (0 != (2 & c_dbcsr_acc_opencl_config.priority) && (NULL != libxsmm_stristr(name, "calc") || (NULL != strstr(name, "priority")))) ? CL_QUEUE_PRIORITY_HIGH_KHR @@ -334,8 +335,8 @@ int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { ACC_OPENCL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve platform extensions", result); if (EXIT_SUCCESS == result) { - if (NULL != strstr(buffer, "cl_khr_priority_hints") || - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia")) { + if (NULL != strstr(buffer, "cl_khr_priority_hints") || EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia")) + { priohi = CL_QUEUE_PRIORITY_HIGH_KHR; priolo = CL_QUEUE_PRIORITY_LOW_KHR; } diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index 994e60692da..ad4cfb5369a 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -249,7 +249,8 @@ int opencl_libsmm_read_smm_params( double gflops; assert(NULL != key && NULL != value); for (; NULL != s; - ++i, s = (c != consumed ? ((s + 1) < end ? strtok((s + 1) + strlen(s), ACC_OPENCL_DELIMS) : NULL) : s), c = consumed) { + ++i, s = (c != consumed ? ((s + 1) < end ? strtok((s + 1) + strlen(s), ACC_OPENCL_DELIMS) : NULL) : s), c = consumed) + { switch (i) { case 0: if (NULL != device && 1 == sscanf(s, "%[^" ACC_OPENCL_DELIMS "]", device)) { @@ -440,7 +441,8 @@ int libsmm_acc_init(void) { memset(&perfest, 0, sizeof(perfest)); if (NULL != env_timer && (opencl_libsmm_timer_host == atoi(env_timer) || (env_timer == libxsmm_stristr(env_timer, "host") && 4 == strlen(env_timer)) || - (env_timer == libxsmm_stristr(env_timer, "cpu") && 3 == strlen(env_timer)))) { + (env_timer == libxsmm_stristr(env_timer, "cpu") && 3 == strlen(env_timer)))) + { opencl_libsmm_timer = opencl_libsmm_timer_host; } if (NULL == env_params || '0' != *env_params) { @@ -526,7 +528,8 @@ int libsmm_acc_init(void) { # endif if ('2' == control) { if (/* try reading OPENCL_LIBSMM_SMM_PARAMS-value as kernel parameters */ - EXIT_SUCCESS == opencl_libsmm_read_smm_params(env_params, &key, &config, NULL /*perfest*/, NULL /*device*/)) { + EXIT_SUCCESS == opencl_libsmm_read_smm_params(env_params, &key, &config, NULL /*perfest*/, NULL /*device*/)) + { key.devuid = 0; if (NULL == OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(config), &config)) { result = EXIT_FAILURE; @@ -763,7 +766,8 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v 0 # endif ) && - 0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim) { + 0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim) + { const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); opencl_libsmm_trans_t* config; opencl_libsmm_transkey_t key; @@ -1080,7 +1084,8 @@ c_dbcsr_acc_bool_t libsmm_acc_process_suitable( if (0 < m_max && 0 < n_max && 0 < k_max && 0 < stack_size /* allow k_max to exceed max_kernel_dim, TODO: BLAS for large kernels (m,n) */ - && m_max <= max_kernel_dim && n_max <= max_kernel_dim && 0 != def_mnk /*homogeneous*/) { + && m_max <= max_kernel_dim && n_max <= max_kernel_dim && 0 != def_mnk /*homogeneous*/) + { # if defined(OPENCL_LIBSMM_SUITABLE) const double ai = OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(double)); hst = ai * opencl_libsmm_dhst; @@ -1098,7 +1103,8 @@ c_dbcsr_acc_bool_t libsmm_acc_process_suitable( if (0 < m_max && 0 < n_max && 0 < k_max && 0 < stack_size /* allow k_max to exceed max_kernel_dim , TODO: BLAS for large kernels (m,n) */ - && m_max <= max_kernel_dim && n_max <= max_kernel_dim && 0 != def_mnk /*homogeneous*/) { + && m_max <= max_kernel_dim && n_max <= max_kernel_dim && 0 != def_mnk /*homogeneous*/) + { # if defined(OPENCL_LIBSMM_SUITABLE) const double ai = OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(float)); hst = ai * opencl_libsmm_shst; @@ -1453,7 +1459,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, if (CL_SUCCESS == clGetDeviceInfo(active_device, (cl_device_info)(dbcsr_type_real_8 == datatype ? 0x4232 : 0x4231), sizeof(cl_bitfield), &fp_atomics, NULL) && - 0 != (/*add*/ (1 << 1) & fp_atomics)) { + 0 != (/*add*/ (1 << 1) & fp_atomics)) + { extensions[1] = "cl_ext_float_atomics"; atomic_exp = (dbcsr_type_real_8 == datatype ? "atomic_fetch_add_explicit((global volatile atomic_double*)A, B, " @@ -1464,7 +1471,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, else if ((0 != c_dbcsr_acc_opencl_config.devinfo.intel_id && 0x4905 != c_dbcsr_acc_opencl_config.devinfo.intel_id && 0 == c_dbcsr_acc_opencl_config.devinfo.unified) || - 0 != atomics_native) { + 0 != atomics_native) + { if (dbcsr_type_real_4 == datatype || 0x0bd5 == c_dbcsr_acc_opencl_config.devinfo.intel_id || 0 != atomics_native) { if (0 == atomics_native && 0x0bd5 != c_dbcsr_acc_opencl_config.devinfo.intel_id) { @@ -1486,7 +1494,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, else if (cl_nonv) { if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al && (0 == (m_max & 1) || (0 == c_dbcsr_acc_opencl_config.devinfo.intel_id /*&& cl_nonv*/)) /* TODO */ - && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) { + && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) + { assert(dbcsr_type_real_4 == datatype); atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A, B)\""; } @@ -1505,7 +1514,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al && (0 == (m_max & 1) || (0 == c_dbcsr_acc_opencl_config.devinfo.intel_id && cl_nonv)) /* TODO */ && '2' == env_atomics[strlen(env_atomics) - 1] && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) { + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) + { assert(dbcsr_type_real_4 == datatype); atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A, B)\""; } @@ -1674,7 +1684,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, void* scratch = NULL; if (CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_a_data), CL_MEM_SIZE, sizeof(size_t), &asize, NULL) && CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_b_data), CL_MEM_SIZE, sizeof(size_t), &bsize, NULL) && - CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_c_data), CL_MEM_SIZE, sizeof(size_t), &csize, NULL)) { + CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_c_data), CL_MEM_SIZE, sizeof(size_t), &csize, NULL)) + { const double alpha = 1, beta = 1; libxsmm_descriptor_blob blob; libxsmm_gemm_descriptor* const desc = libxsmm_gemm_descriptor_dinit(