diff --git a/examples/c/vector_add.py b/examples/c/vector_add.py index c5956f2ad..38122b49d 100755 --- a/examples/c/vector_add.py +++ b/examples/c/vector_add.py @@ -26,7 +26,7 @@ } """ -size = 72*1024*1024 +size = 72 * 1024 * 1024 a = numpy.random.randn(size).astype(numpy.float32) b = numpy.random.randn(size).astype(numpy.float32) @@ -39,7 +39,6 @@ tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32] tune_params["vecsize"] = [1, 2, 4, 8, 16] -answer = [a+b, None, None, None] +answer = [a + b, None, None, None] -tune_kernel("vector_add", kernel_string, size, args, tune_params, - answer=answer, compiler_options=['-O3']) +tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"]) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py new file mode 100644 index 000000000..d6fc1fba4 --- /dev/null +++ b/examples/directives/histogram_c_openacc.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenACC code with the kernel tuner""" +import numpy as np + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives + + +# Naive Python histogram implementation +def histogram(vector, hist): + for i in range(0, len(vector)): + hist[vector[i]] += 1 + return hist + + +code = """ +#include + +#define HIST_SIZE 256 +#define VECTOR_SIZE 1000000 + +#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#if enable_reduction == 1 +#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE]) +#else +#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) +#endif +#pragma acc loop independent +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +#if enable_atomic == 1 + #pragma acc atomic update +#endif + hist[vector[i]] += 1; +} +#pragma tuner stop +""" + +# Extract tunable directive +app = Code(OpenACC(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["ngangs"] = [2**i for i in range(1, 11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["enable_reduction"] = [0, 1] +tune_params["enable_atomic"] = [0, 1] +constraints = ["enable_reduction != enable_atomic"] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32) +kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) +reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) +reference_hist = histogram(kernel_args["histogram"][0], reference_hist) +answer = [None, reference_hist] + +tune_kernel( + "histogram", + kernel_string["histogram"], + 0, + kernel_args["histogram"], + tune_params, + restrictions=constraints, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], +) diff --git a/examples/directives/histogram_c_openmp.py b/examples/directives/histogram_c_openmp.py new file mode 100644 index 000000000..190c35789 --- /dev/null +++ b/examples/directives/histogram_c_openmp.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenMP code with the kernel tuner""" +import numpy as np + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, 
Cxx, process_directives + + +# Naive Python histogram implementation +def histogram(vector, hist): + for i in range(0, len(vector)): + hist[vector[i]] += 1 + return hist + + +code = """ +#include + +#define HIST_SIZE 256 +#define VECTOR_SIZE 1000000 + +#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#if enable_reduction == 1 +#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE]) +#else +#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) +#endif +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +#if enable_atomic == 1 + #pragma omp atomic update +#endif + hist[vector[i]] += 1; +} +#pragma tuner stop +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["nteams"] = [2**i for i in range(1, 11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["enable_reduction"] = [0, 1] +tune_params["enable_atomic"] = [0, 1] +constraints = ["enable_reduction != enable_atomic"] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32) +kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) +reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) +reference_hist = histogram(kernel_args["histogram"][0], reference_hist) +answer = [None, reference_hist] + +tune_kernel( + "histogram", + kernel_string["histogram"], + 0, + kernel_args["histogram"], + tune_params, + restrictions=constraints, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +) diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py index d8bf7cc4e..96b94a1d3 100644 --- a/examples/directives/matrix_multiply_c_openacc.py +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -1,13 +1,8 @@ #!/usr/bin/env python """This is an example tuning a naive matrix multiplication using the simplified directives interface""" -from kernel_tuner import tune_kernel -from kernel_tuner.utils.directives import ( - Code, - OpenACC, - Cxx, - process_directives -) +from kernel_tuner import tune_kernel, run_kernel +from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives N = 4096 @@ -45,6 +40,12 @@ metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 +# compute reference solution from CPU +results = run_kernel( + "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"] +) +answer = [None, None, results[2]] + tune_kernel( "mm", kernel_string["mm"], @@ -52,6 +53,7 @@ kernel_args["mm"], tune_params, metrics=metrics, - compiler_options=["-fast", "-acc=gpu"], + answer=answer, compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/directives/matrix_multiply_c_openmp.py b/examples/directives/matrix_multiply_c_openmp.py new file mode 100644 index 000000000..3298ffb7b --- /dev/null +++ b/examples/directives/matrix_multiply_c_openmp.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +"""This is an example tuning a naive matrix multiplication using the simplified 
directives interface""" + +from kernel_tuner import tune_kernel, run_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + +N = 4096 + +code = """ +#define N 4096 + +void matrix_multiply(float *A, float *B, float *C) { + #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) + float temp_sum = 0.0f; + #pragma omp target + #pragma omp teams distribute collapse(2) + for ( int i = 0; i < N; i++) { + for ( int j = 0; j < N; j++ ) { + temp_sum = 0.0f; + #pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum) + for ( int k = 0; k < N; k++ ) { + temp_sum += A[(i * N) + k] * B[(k * N) + j]; + } + C[(i * N) + j] = temp_sum; + } + } + #pragma tuner stop +} +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +dims = {"NN": N**2} +kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims) + +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["time_s"] = lambda x: x["time"] / 10**3 +metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 +metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 + +# compute reference solution from CPU +results = run_kernel( + "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"] +) +answer = [None, None, results[2]] + +tune_kernel( + "mm", + kernel_string["mm"], + 0, + kernel_args["mm"], + tune_params, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +) diff --git a/examples/directives/vector_add_c_openacc.py b/examples/directives/vector_add_c_openacc.py index c062f2b1e..4730f3fd5 100644 --- a/examples/directives/vector_add_c_openacc.py +++ b/examples/directives/vector_add_c_openacc.py @@ -67,6 +67,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-acc=gpu"], compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py new file mode 100644 index 000000000..e212532cf --- /dev/null +++ b/examples/directives/vector_add_c_openmp.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenMP code with the kernel tuner""" + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + +code = """ +#include + +#define VECTOR_SIZE 1000000 + +int main(void) { + int size = VECTOR_SIZE; + float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) + #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); +} +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["nteams"] = [2**i for i in range(1,11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["vector_add"][0])) + (4 * len(kernel_args["vector_add"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +answer = [None, None, kernel_args["vector_add"][0] + 
kernel_args["vector_add"][1], None] + +tune_kernel( + "vector_add", + kernel_string["vector_add"], + 0, + kernel_args["vector_add"], + tune_params, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +) diff --git a/examples/directives/vector_add_fortran_openacc.py b/examples/directives/vector_add_fortran_openacc.py index 29e94646a..503a4538e 100644 --- a/examples/directives/vector_add_fortran_openacc.py +++ b/examples/directives/vector_add_fortran_openacc.py @@ -62,6 +62,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-acc=gpu"], compiler="nvfortran", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/fortran/test_fortran_vector_add.py b/examples/fortran/test_fortran_vector_add.py deleted file mode 100755 index 415647024..000000000 --- a/examples/fortran/test_fortran_vector_add.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python -"""This is a minimal example for calling Fortran functions""" - -import json -from pathlib import Path -import numpy as np -from kernel_tuner import run_kernel - - -def test(): - filename = Path(__file__).parent / "vector_add.F90" - with open(filename, "r") as f: - kernel_string = f.read() - - size = 10000000 - - a = np.random.randn(size).astype(np.float32) - b = np.random.randn(size).astype(np.float32) - c = np.zeros_like(b) - n = np.int32(size) - - args = [c, a, b, n] - - tune_params = dict() - tune_params["N"] = size - tune_params["NTHREADS"] = 4 - - answer = run_kernel( - "vector_add", - kernel_string, - size, - args, - tune_params, - lang="fortran", - compiler="gfortran", - ) - - assert np.allclose(answer[0], a + b, atol=1e-8) - - -if __name__ == "__main__": - test() diff --git a/examples/fortran/vector_add.F90 b/examples/fortran/vector_add.F90 index b5541b59c..6d39752b5 100644 --- a/examples/fortran/vector_add.F90 +++ b/examples/fortran/vector_add.F90 @@ -17,11 +17,11 @@ subroutine vector_add(C, A, B, n) real (c_float), intent(in), dimension(N) :: A, B integer (c_int), intent(in) :: n - !$OMP parallel do + !$omp parallel do do i = 1, N C(i) = A(i) + B(i) end do - !$OMP end parallel do + !$omp end parallel do end subroutine vector_add diff --git a/examples/fortran/vector_add.py b/examples/fortran/vector_add.py index 5c1a5476b..813f6671e 100755 --- a/examples/fortran/vector_add.py +++ b/examples/fortran/vector_add.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """This is a minimal example for calling Fortran functions""" -from __future__ import print_function import numpy as np from kernel_tuner import tune_kernel @@ -29,6 +28,7 @@ def tune(): tune_params, lang="C", compiler="gfortran", + compiler_options=["-fopenmp"], ) return result diff --git a/examples/fortran/vector_add_acc.F90 b/examples/fortran/vector_add_acc.F90 deleted file mode 100644 index 9188e74cc..000000000 --- a/examples/fortran/vector_add_acc.F90 +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef N -#define N 1024 -#endif -#ifndef block_size_x -#define block_size_x 256 -#endif - -module vector -use iso_c_binding -use omp_lib - -contains - -subroutine vector_add(C, A, B, n) - use iso_c_binding - real (c_float), intent(out), dimension(N) :: C - real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), value, intent(in) :: n - - !$acc data copyin(A, B) copyout(C) - - !$acc parallel loop device_type(nvidia) vector_length(block_size_x) - do i = 1, n - C(i) = A(i) + B(i) - end do - !$acc end parallel loop - - !$acc end data - -end subroutine vector_add - - - -function time_vector_add(C, A, B, n) result(time) - 
use iso_c_binding - real (c_float), intent(out), dimension(N) :: C - real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), value, intent(in) :: n - real (c_float) :: time - real (c_double) start_time, end_time - - start_time = omp_get_wtime() - - call vector_add(C, A, B, n) - - end_time = omp_get_wtime() - time = (end_time - start_time)*1e3 - -end function time_vector_add - - - -end module vector diff --git a/examples/fortran/vector_add_acc.py b/examples/fortran/vector_add_acc.py deleted file mode 100755 index ff84c8b6e..000000000 --- a/examples/fortran/vector_add_acc.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -"""This is a minimal example for calling Fortran functions""" - -import numpy as np -from kernel_tuner import tune_kernel - - -def tune(): - size = int(72 * 1024 * 1024) - - a = np.random.randn(size).astype(np.float32) - b = np.random.randn(size).astype(np.float32) - c = np.zeros_like(b) - n = np.int32(size) - - args = [c, a, b, n] - - tune_params = dict() - tune_params["N"] = [size] - tune_params["block_size_x"] = [32, 64, 128, 256, 512] - - result, env = tune_kernel( - "time_vector_add", - "vector_add_acc.F90", - size, - args, - tune_params, - lang="C", - compiler="nvfortran", - compiler_options=["-fast", "-acc=gpu"], - ) - - return result - - -if __name__ == "__main__": - tune() diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index a37c9d6e7..c458789e1 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -57,6 +57,12 @@ def memcpy_htod(self, dest, src): """This method must implement a host to device copy.""" pass + def refresh_memory(self, arguments, should_sync): + """Copy the original content of the output memory to device memory.""" + for i, arg in enumerate(arguments): + if should_sync[i]: + self.memcpy_htod(self.allocations[i], arg) + class GPUBackend(Backend): """Base class for GPU backends""" @@ -87,3 +93,8 @@ class CompilerBackend(Backend): @abstractmethod def __init__(self, iterations, compiler_options, compiler): pass + + @abstractmethod + def cleanup_lib(self): + """Unload the previously loaded shared library""" + pass diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 154f501ba..76028fb94 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -82,8 +82,12 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ + # allocations contains a clean copy of the memory + self.allocations = [] self.observers = observers or [] self.observers.append(CompilerRuntimeObserver(self)) + for obs in self.observers: + obs.register_device(self) self.iterations = iterations self.max_threads = 1024 @@ -93,7 +97,6 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers self.lib = None self.using_openmp = False self.using_openacc = False - self.observers = [CompilerRuntimeObserver(self)] self.last_result = None if self.compiler == "g++": @@ -150,11 +153,6 @@ def ready_argument_list(self, arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): - # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference - # to its underlying array, so we need to store a reference to arg.copy() - # in the Argument object manually to avoid it being deleted. - # (This changed in numpy > 1.15.) 
- # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) else: raise TypeError("unknown dtype for ndarray") @@ -163,6 +161,7 @@ def ready_argument_list(self, arguments): elif is_cupy_array(arg): data_ctypes = C.c_void_p(arg.data.ptr) ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes) + self.allocations.append(Argument(numpy=arg.copy(), ctypes=data_ctypes)) return ctype_args def compile(self, kernel_instance): @@ -186,16 +185,13 @@ def compile(self, kernel_instance): compiler_options = ["-fPIC"] # detect openmp - if "#include " in kernel_string or "use omp_lib" in kernel_string: + if "#pragma omp" in kernel_string or "!$omp" in kernel_string: logging.debug("set using_openmp to true") self.using_openmp = True - if self.compiler in ["nvc", "nvc++", "nvfortran"]: - compiler_options.append("-mp") - else: - compiler_options.append("-fopenmp") # detect openacc if "#pragma acc" in kernel_string or "!$acc" in kernel_string: + logging.debug("set using_openacc to true") self.using_openacc = True # if filename is known, use that one @@ -372,7 +368,8 @@ def memset(self, allocation, value, size): C.memset(allocation.ctypes, value, size) def memcpy_dtoh(self, dest, src): - """a simple memcpy copying from an Argument to a numpy array + """This method implements the semantic of a device to host copy for the Compiler backend. + There is no actual copy from device to host happening, but host to host. :param dest: A numpy or cupy array to store the data :type dest: np.ndarray or cupy.ndarray @@ -380,6 +377,7 @@ def memcpy_dtoh(self, dest, src): :param src: An Argument for some memory allocation :type src: Argument """ + # there is no real copy from device to host, but host to host if isinstance(dest, np.ndarray) and is_cupy_array(src.numpy): # Implicit conversion to a NumPy array is not allowed. value = src.numpy.get() @@ -389,27 +387,20 @@ def memcpy_dtoh(self, dest, src): dest[:] = xp.asarray(value) def memcpy_htod(self, dest, src): - """a simple memcpy copying from a numpy array to an Argument - - :param dest: An Argument for some memory allocation - :type dest: Argument + """There is no memcpy_htod implemented for the compiler backend.""" + pass - :param src: A numpy or cupy array containing the source data - :type src: np.ndarray or cupy.ndarray - """ - if isinstance(dest.numpy, np.ndarray) and is_cupy_array(src): - # Implicit conversion to a NumPy array is not allowed. 
- value = src.get() - else: - value = src - xp = get_array_module(dest.numpy) - dest.numpy[:] = xp.asarray(value) + def refresh_memory(self, arguments, should_sync): + """Copy the preserved content of the output memory to used arrays.""" + for i, arg in enumerate(arguments): + if should_sync[i]: + self.memcpy_dtoh(arg, self.allocations[i]) def cleanup_lib(self): """unload the previously loaded shared library""" if self.lib is None: return - + if not self.using_openmp and not self.using_openacc: # this if statement is necessary because shared libraries that use # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 4323c411c..2f51b9b5b 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -79,9 +79,7 @@ def __init__(self, kernel_name, kernel_sources, lang, defines=None): self.defines = defines if lang is None: if callable(self.kernel_sources[0]): - raise TypeError( - "Please specify language when using a code generator function" - ) + raise TypeError("Please specify language when using a code generator function") kernel_string = self.get_kernel_string(0) lang = util.detect_language(kernel_string) @@ -108,9 +106,7 @@ def get_kernel_string(self, index=0, params=None): kernel_source = self.kernel_sources[index] return util.get_kernel_string(kernel_source, params) - def prepare_list_of_files( - self, kernel_name, params, grid, threads, block_size_names - ): + def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_names): """prepare the kernel string along with any additional files The first file in the list is allowed to include or read in the others @@ -146,9 +142,7 @@ def prepare_list_of_files( for i, f in enumerate(self.kernel_sources): if i > 0 and not util.looks_like_a_filename(f): - raise ValueError( - "When passing multiple kernel sources, the secondary entries must be filenames" - ) + raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames") ks = self.get_kernel_string(i, params) # add preprocessor statements @@ -182,9 +176,7 @@ def prepare_list_of_files( def get_user_suffix(self, index=0): """Get the suffix of the kernel filename, if the user specified one. Return None otherwise.""" - if util.looks_like_a_filename(self.kernel_sources[index]) and ( - "." in self.kernel_sources[index] - ): + if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]): return "." + self.kernel_sources[index].split(".")[-1] return None @@ -213,13 +205,9 @@ def check_argument_lists(self, kernel_name, arguments): """ for i, f in enumerate(self.kernel_sources): if not callable(f): - util.check_argument_list( - kernel_name, self.get_kernel_string(i), arguments - ) + util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments) else: - logging.debug( - "Checking of arguments list not supported yet for code generators." 
- ) + logging.debug("Checking of arguments list not supported yet for code generators.") class DeviceInterface(object): @@ -303,6 +291,7 @@ def __init__( compiler=compiler, compiler_options=compiler_options, iterations=iterations, + observers=observers, ) elif lang.upper() == "HIP": dev = HipFunctions( @@ -312,7 +301,9 @@ def __init__( observers=observers, ) else: - raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet") + raise ValueError( + "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet" + ) self.dev = dev # look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra @@ -453,9 +444,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett obs.results = result duration = max(duration, obs.continuous_duration) - self.benchmark_continuous( - func, gpu_args, instance.threads, instance.grid, result, duration - ) + self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration) except Exception as e: # some launches may fail because too many registers are required @@ -468,9 +457,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett "INVALID_WORK_GROUP_SIZE", ] if any([skip_str in str(e) for skip_str in skippable_exceptions]): - logging.debug( - "benchmark fails due to runtime failure too many resources required" - ) + logging.debug("benchmark fails due to runtime failure too many resources required") if verbose: print( f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" @@ -482,13 +469,11 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett raise e return result - def check_kernel_output( - self, func, gpu_args, instance, answer, atol, verify, verbose - ): + def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, verbose): """runs the kernel once and checks the result against answer""" logging.debug("check_kernel_output") - #if not using custom verify function, check if the length is the same + # if not using custom verify function, check if the length is the same if answer: if len(instance.arguments) != len(answer): raise TypeError("The length of argument list and provided results do not match.") @@ -499,14 +484,13 @@ def check_kernel_output( # re-copy original contents of output arguments to GPU memory, to overwrite any changes # by earlier kernel runs - for i, arg in enumerate(instance.arguments): - if should_sync[i]: - self.dev.memcpy_htod(gpu_args[i], arg) + self.dev.refresh_memory(instance.arguments, should_sync) # run the kernel check = self.run_kernel(func, gpu_args, instance) if not check: - return # runtime failure occured that should be ignored, skip correctness check + # runtime failure occured that should be ignored, skip correctness check + return # retrieve gpu results to host memory result_host = [] @@ -517,7 +501,7 @@ def check_kernel_output( self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): if not answer[i].is_cuda: - #if the answer is on the host, copy gpu output to host as well + # if the answer is on the host, copy gpu output to host as well result_host.append(torch.zeros_like(answer[i])) self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) else: @@ -545,10 +529,7 @@ def check_kernel_output( correct = True if not correct: - 
raise RuntimeError( - "Kernel result verification failed for: " - + util.get_config_string(instance.params) - ) + raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers @@ -562,7 +543,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # Compile and benchmark a kernel instance based on kernel strings and parameters instance_string = util.get_instance_string(params) - logging.debug('compile_and_benchmark ' + instance_string) + logging.debug("compile_and_benchmark " + instance_string) instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): @@ -580,9 +561,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, else: # add shared memory arguments to compiled module if kernel_options.smem_args is not None: - self.dev.copy_shared_memory_args( - util.get_smem_args(kernel_options.smem_args, params) - ) + self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params)) # add constant memory arguments to compiled module if kernel_options.cmem_args is not None: self.dev.copy_constant_memory_args(kernel_options.cmem_args) @@ -596,12 +575,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # test kernel for correctness if func and (to.answer or to.verify or self.output_observers): start_verification = time.perf_counter() - self.check_kernel_output( - func, gpu_args, instance, to.answer, to.atol, to.verify, verbose - ) - last_verification_time = 1000 * ( - time.perf_counter() - start_verification - ) + self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) + last_verification_time = 1000 * (time.perf_counter() - start_verification) # benchmark if func: @@ -617,10 +592,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, except Exception as e: # dump kernel sources to temp file temp_filenames = instance.prepare_temp_files_for_error_msg() - print( - "Error while compiling or benchmarking, see source files: " - + " ".join(temp_filenames) - ) + print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames)) raise e # clean up any temporary files, if no error occured @@ -666,7 +638,7 @@ def compile_kernel(self, instance, verbose): @staticmethod def preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" return _preprocess_gpu_arguments(old_arguments, params) def copy_shared_memory_args(self, smem_args): @@ -702,9 +674,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) ) if np.prod(threads) > self.dev.max_threads: if verbose: - print( - f"skipping config {util.get_instance_string(params)} reason: too many threads per block" - ) + print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block") return util.InvalidConfig() # obtain the kernel_string and prepare additional files, if any @@ -723,7 +693,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) # Preprocess GPU arguments. 
Require for handling `Tunable` arguments arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) - #collect everything we know about this instance and return it + # collect everything we know about this instance and return it return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) def get_environment(self): @@ -770,12 +740,8 @@ def run_kernel(self, func, gpu_args, instance): try: self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid) except Exception as e: - if "too many resources requested for launch" in str( - e - ) or "OUT_OF_RESOURCES" in str(e): - logging.debug( - "ignoring runtime failure due to too many resources required" - ) + if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e): + logging.debug("ignoring runtime failure due to too many resources required") return False else: logging.debug("encountered unexpected runtime failure: " + str(e)) @@ -784,7 +750,7 @@ def run_kernel(self, func, gpu_args, instance): def _preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" new_arguments = [] for argument in old_arguments: @@ -801,15 +767,11 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): # first check if the length is the same if len(instance.arguments) != len(answer): - raise TypeError( - "The length of argument list and provided results do not match." - ) + raise TypeError("The length of argument list and provided results do not match.") # for each element in the argument list, check if the types match for i, arg in enumerate(instance.arguments): if answer[i] is not None: # skip None elements in the answer list - if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance( - arg, (np.ndarray, cp.ndarray) - ): + if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(arg, (np.ndarray, cp.ndarray)): if answer[i].dtype != arg.dtype: raise TypeError( f"Element {i} of the expected results list is not of the same dtype as the kernel output: " @@ -857,16 +819,14 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): ) else: # either answer[i] and argument have different types or answer[i] is not a numpy type - if not isinstance( - answer[i], (np.ndarray, cp.ndarray, torch.Tensor) - ) or not isinstance(answer[i], np.number): + if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance( + answer[i], np.number + ): raise TypeError( f"Element {i} of expected results list is not a numpy/cupy ndarray, torch Tensor or numpy scalar." ) else: - raise TypeError( - f"Element {i} of expected results list and kernel arguments have different types." 
- ) + raise TypeError(f"Element {i} of expected results list and kernel arguments have different types.") def _ravel(a): if hasattr(a, "ravel") and len(a.shape) > 1: @@ -886,26 +846,15 @@ def _flatten(a): expected = _flatten(expected) if any([isinstance(array, cp.ndarray) for array in [expected, result]]): output_test = cp.allclose(expected, result, atol=atol) - elif isinstance(expected, torch.Tensor) and isinstance( - result, torch.Tensor - ): + elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor): output_test = torch.allclose(expected, result, atol=atol) else: output_test = np.allclose(expected, result, atol=atol) if not output_test and verbose: - print( - "Error: " - + util.get_config_string(instance.params) - + " detected during correctness check" - ) - print( - "this error occured when checking value of the %oth kernel argument" - % (i,) - ) - print( - "Printing kernel output and expected result, set verbose=False to suppress this debug print" - ) + print("Error: " + util.get_config_string(instance.params) + " detected during correctness check") + print("this error occured when checking value of the %oth kernel argument" % (i,)) + print("Printing kernel output and expected result, set verbose=False to suppress this debug print") np.set_printoptions(edgeitems=50) print("Kernel output:") print(result) @@ -940,11 +889,7 @@ def apply_template_typenames(type_list, templated_typenames): def replace_typename_token(matchobj): """function for a whitespace preserving token regex replace""" # replace only the match, leaving the whitespace around it as is - return ( - matchobj.group(1) - + templated_typenames[matchobj.group(2)] - + matchobj.group(3) - ) + return matchobj.group(1) + templated_typenames[matchobj.group(2)] + matchobj.group(3) for i, arg_type in enumerate(type_list): for k, v in templated_typenames.items(): @@ -975,9 +920,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): # relatively strict regex that does not allow nested template parameters like vector # within the template parameter list regex = ( - r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" - + name - + r"\s*\((.*?)\)\s*\{" + r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" 
+ name + r"\s*\((.*?)\)\s*\{" ) match = re.search(regex, kernel_string, re.S) if not match: @@ -985,15 +928,12 @@ def wrap_templated_kernel(kernel_string, kernel_name): template_parameters = match.group(1).split(",") argument_list = match.group(3).split(",") - argument_list = [ - s.strip() for s in argument_list - ] # remove extra whitespace around 'type name' strings + # remove extra whitespace around 'type name' strings + argument_list = [s.strip() for s in argument_list] type_list, name_list = split_argument_list(argument_list) - templated_typenames = get_templated_typenames( - template_parameters, template_arguments - ) + templated_typenames = get_templated_typenames(template_parameters, template_arguments) apply_template_typenames(type_list, templated_typenames) # replace __global__ with __device__ in the templated kernel definition @@ -1007,9 +947,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): launch_bounds = match.group(2) # generate code for the compile-time template instantiation - template_instantiation = ( - f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n" - ) + template_instantiation = f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n" # generate code for the wrapper kernel new_arg_list = ", ".join([" ".join((a, b)) for a, b in zip(type_list, name_list)]) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index ded8c57fe..36cd219b0 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -56,6 +56,13 @@ def get(self) -> str: return "openacc" +class OpenMP(Directive): + """Class to represent OpenMP""" + + def get(self) -> str: + return "openmp" + + class Cxx(Language): """Class to represent C++ code""" @@ -131,6 +138,11 @@ def is_openacc(directive: Directive) -> bool: return isinstance(directive, OpenACC) +def is_openmp(directive: Directive) -> bool: + """Check if a directive is OpenMP""" + return isinstance(directive, OpenMP) + + def is_cxx(lang: Language) -> bool: """Check if language is C++""" return isinstance(lang, Cxx) @@ -141,6 +153,19 @@ def is_fortran(lang: Language) -> bool: return isinstance(lang, Fortran) +def line_contains(line: str, target: str) -> bool: + """Generic helper to check if a line contains the target""" + return target in line + + +def directive_contains_clause(line: str, clauses: list) -> bool: + """Check if a directive contains one clause from a list""" + for clause in clauses: + if clause in line: + return True + return False + + def line_contains_openacc_directive(line: str, lang: Language) -> bool: """Check if line contains an OpenACC directive or not""" if is_cxx(lang): @@ -160,6 +185,25 @@ def line_contains_openacc_directive_fortran(line: str) -> bool: return line_contains(line, "!$acc") +def line_contains_openmp_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenMP directive or not""" + if is_cxx(lang): + return line_contains_openmp_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openmp_directive_fortran(line) + return False + + +def line_contains_openmp_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenMP directive or not""" + return line_contains(line, "#pragma omp") + + +def line_contains_openmp_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenMP directive or not""" + return line_contains(line, "!$omp") + + def line_contains_openacc_parallel_directive(line: str, lang: Language) -> bool: """Check if 
line contains an OpenACC parallel directive or not""" if is_cxx(lang): @@ -179,23 +223,35 @@ def line_contains_openacc_parallel_directive_fortran(line: str) -> bool: return line_contains(line, "!$acc parallel") -def line_contains(line: str, target: str) -> bool: - """Generic helper to check if a line contains the target""" - return target in line +def line_contains_openmp_target_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenMP target directive or not""" + if is_cxx(lang): + return line_contains_openmp_target_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openmp_target_directive_fortran(line) + return False -def openacc_directive_contains_clause(line: str, clauses: list) -> bool: - """Check if an OpenACC directive contains one clause from a list""" - for clause in clauses: - if clause in line: - return True - return False +def line_contains_openmp_target_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenMP target directive or not""" + return line_contains(line, "#pragma omp target") + + +def line_contains_openmp_target_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenMP target directive or not""" + return line_contains(line, "!$omp target") def openacc_directive_contains_data_clause(line: str) -> bool: """Check if an OpenACC directive contains one data clause""" data_clauses = ["copy", "copyin", "copyout", "create", "no_create", "present", "device_ptr", "attach"] - return openacc_directive_contains_clause(line, data_clauses) + return directive_contains_clause(line, data_clauses) + + +def openmp_directive_contains_data_clause(line: str) -> bool: + """Check if an OpenMP directive contains one data clause""" + data_clauses = ["map"] + return directive_contains_clause(line, data_clauses) def create_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: @@ -223,6 +279,29 @@ def create_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: ) +def create_data_directive_openmp(name: str, size: ArraySize, lang: Language) -> str: + """Create a data directive for a given language""" + if is_cxx(lang): + return create_data_directive_openmp_cxx(name, size) + elif is_fortran(lang): + return create_data_directive_openmp_fortran(name, size) + return "" + + +def create_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to allocate and copy data""" + return f"#pragma omp target enter data map(to: {name}[:{size.get()}])\n" + + +def create_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: + """Create Fortran OpenMP code to allocate and copy data""" + if len(size) == 1: + return f"!$omp target enter data map(to: {name}(:{size.get()}))\n" + else: + md_size = fortran_md_size(size) + return f"!$omp target enter data map(to: {name}({','.join(md_size)}))\n" + + def exit_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: """Create code to copy data back for a given language""" if is_cxx(lang): @@ -246,6 +325,29 @@ def exit_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: return f"!$acc exit data copyout({name}({','.join(md_size)}))\n" +def exit_data_directive_openmp(name: str, size: ArraySize, lang: Language) -> str: + """Create code to copy data back for a given language""" + if is_cxx(lang): + return exit_data_directive_openmp_cxx(name, size) + elif is_fortran(lang): + return exit_data_directive_openmp_fortran(name, size) + return "" + + +def 
exit_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to copy back data""" + return f"#pragma omp target exit data map(from: {name}[:{size.get()}])\n" + + +def exit_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: + """Create Fortran OpenMP code to copy back data""" + if len(size) == 1: + return f"!$omp target exit data map(from: {name}(:{size.get()}))\n" + else: + md_size = fortran_md_size(size) + return f"!$omp target exit data map(from: {name}({','.join(md_size)}))\n" + + def correct_kernel(kernel_name: str, line: str) -> bool: """Checks if the line contains the correct kernel name""" return f" {kernel_name} " in line or (kernel_name in line and len(line.partition(kernel_name)[2]) == 0) @@ -379,15 +481,38 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) - if is_openacc(langs.directive) and is_cxx(langs.language): - intro += create_data_directive_openacc_cxx(name, size) - outro += exit_data_directive_openacc_cxx(name, size) - elif is_openacc(langs.directive) and is_fortran(langs.language): - intro += create_data_directive_openacc_fortran(name, size) - outro += exit_data_directive_openacc_fortran(name, size) + temp = [] + if is_openacc(langs.directive): + temp = wrap_data_openacc(name, size, langs) + elif is_openmp(langs.directive): + temp = wrap_data_openmp(name, size, langs) + intro += temp[0] + outro += temp[1] return "\n".join([intro, code, outro]) +def wrap_data_openacc(name: str, size: int, langs: Code) -> Tuple[str, str]: + """Create language specific data directives""" + if is_cxx(langs.language): + intro = create_data_directive_openacc_cxx(name, size) + outro = exit_data_directive_openacc_cxx(name, size) + elif is_fortran(langs.language): + intro = create_data_directive_openacc_fortran(name, size) + outro = exit_data_directive_openacc_fortran(name, size) + return intro, outro + + +def wrap_data_openmp(name: str, size: int, langs: Code) -> Tuple[str, str]: + """Create language specific data directives""" + if is_cxx(langs.language): + intro = create_data_directive_openmp_cxx(name, size) + outro = exit_data_directive_openmp_cxx(name, size) + elif is_fortran(langs.language): + intro = create_data_directive_openmp_fortran(name, size) + outro = exit_data_directive_openmp_fortran(name, size) + return intro, outro + + def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" if is_cxx(langs.language): @@ -472,7 +597,7 @@ def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) p_type = param[1:-1] p_type = p_type.split(":")[0] if "*" in p_type: - p_type = p_type.replace("*", " * restrict") + p_type = p_type.replace("*", " *") if is_cxx(langs.language): params.append(f"{p_type} {p_name}") elif is_fortran(langs.language): diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 913fee85d..9f37c1ee5 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -146,12 +146,6 @@ def test_byte_array_arguments(): assert all(output_arg1 == arg1) - dest = np.zeros_like(arg1) - - cfunc.memcpy_dtoh(dest, output[0]) - - assert all(dest == arg1) - @patch("kernel_tuner.backends.compiler.subprocess") @patch("kernel_tuner.backends.compiler.numpy.ctypeslib") @@ -159,9 +153,7 @@ def test_compile(npct, subprocess): 
kernel_string = "this is a fake C program" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() f = cfunc.compile(kernel_instance) @@ -191,9 +183,7 @@ def test_compile_detects_device_code(npct, subprocess): kernel_string = "this code clearly contains device code __global__ kernel(float* arg){ return; }" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() cfunc.compile(kernel_instance) @@ -230,111 +220,6 @@ def test_memset(): assert all(x == np.zeros(4)) -@skip_if_no_cupy -def test_memcpy_dtoh(): - import cupy as cp - - a = [1, 2, 3, 4] - x = cp.asarray(a, dtype=np.float32) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - output = np.zeros(len(x), dtype=x.dtype) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(f"{type(x)=} {x=}") - print(f"{type(a)=} {a=}") - print(f"{type(output)=} {output=}") - - assert all(output == a) - assert all(x.get() == a) - - -@skip_if_no_gcc -def test_memcpy_host_dtoh(): - a = [1, 2, 3, 4] - x = np.array(a).astype(np.float32) - x_c = x.ctypes.data_as(C.POINTER(C.c_float)) - arg = Argument(numpy=x, ctypes=x_c) - output = np.zeros_like(x) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(a) - print(output) - - assert all(output == a) - assert all(x == a) - - -@skip_if_no_cupy -def test_memcpy_device_dtoh(): - import cupy as cp - - a = [1, 2, 3, 4] - x = cp.asarray(a, dtype=np.float32) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - output = cp.zeros_like(x) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(f"{type(x)=} {x=}") - print(f"{type(a)=} {a=}") - print(f"{type(output)=} {output=}") - - assert all(output.get() == a) - assert all(x.get() == a) - - -@skip_if_no_cupy -def test_memcpy_htod(): - import cupy as cp - - a = [1, 2, 3, 4] - src = np.array(a, dtype=np.float32) - x = cp.zeros(len(src), dtype=src.dtype) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy.get() == a) - - -def test_memcpy_host_htod(): - a = [1, 2, 3, 4] - src = np.array(a).astype(np.float32) - x = np.zeros_like(src) - x_c = x.ctypes.data_as(C.POINTER(C.c_float)) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy == a) - - -@skip_if_no_cupy -def test_memcpy_device_htod(): - import cupy as cp - - a = [1, 2, 3, 4] - src = cp.array(a, dtype=np.float32) - x = cp.zeros(len(src), dtype=src.dtype) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy.get() == a) - - @skip_if_no_gfortran def test_complies_fortran_function_no_module(): kernel_string = """ @@ -347,9 +232,7 @@ def test_complies_fortran_function_no_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = 
KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) @@ -378,9 +261,7 @@ def test_complies_fortran_function_with_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) try: cfunc = CompilerFunctions(compiler="gfortran") @@ -427,7 +308,7 @@ def env(): @skip_if_no_openmp @skip_if_no_gcc def test_benchmark(env): - results, _ = kernel_tuner.tune_kernel(*env, block_size_names=["nthreads"]) + results, _ = kernel_tuner.tune_kernel(*env, compiler_options=["-fopenmp"], block_size_names=["nthreads"]) assert len(results) == 3 assert all(["nthreads" in result for result in results]) assert all(["time" in result for result in results]) @@ -487,3 +368,26 @@ def test_run_kernel(): lang="C", ) assert cp.all((a + b) == c) + + +def test_refresh_memory(): + arg1 = np.array([1, 2, 3]).astype(np.int32) + arguments = [arg1] + cfunc = CompilerFunctions() + _ = cfunc.ready_argument_list(arguments) + assert np.all(arguments[0] == [1, 2, 3]) + arguments[0] = np.array([0, 0, 0]).astype(np.int8) + assert np.all(arguments[0] == [0, 0, 0]) + cfunc.refresh_memory(arguments, [True]) + assert np.all(arguments[0] == [1, 2, 3]) + + +def test_memcpy_dtoh(): + arg1 = np.array([0, 5, 0, 7]).astype(np.int32) + arguments = [arg1] + cfunc = CompilerFunctions() + ready_arguments = cfunc.ready_argument_list(arguments) + output = np.array([5, 0, 7, 0]).astype(np.int32) + assert np.all(ready_arguments[0].numpy != output) + cfunc.memcpy_dtoh(output, ready_arguments[0]) + assert np.all(ready_arguments[0].numpy == output) diff --git a/test/test_core.py b/test/test_core.py index a8624470e..39597b86c 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -107,11 +107,12 @@ def test_check_kernel_output(dev_func_interface): dev.check_kernel_output('func', answer, instance, answer, atol, None, True) - dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0]) + dfi.refresh_memory.assert_called() dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1)) print(dfi.mock_calls) + assert dfi.refresh_memory.called == 1 assert dfi.memcpy_dtoh.called == 1 for name, args, _ in dfi.mock_calls: diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index d58a44421..759a20a54 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -4,6 +4,13 @@ def test_is_openacc(): assert is_openacc(OpenACC()) assert not is_openacc(None) + assert not is_openacc(OpenMP()) + + +def test_is_openmp(): + assert is_openmp(OpenMP()) + assert not is_openmp(None) + assert not is_openmp(OpenACC()) def test_is_cxx(): @@ -28,6 +35,16 @@ def test_line_contains_openacc_directive(): assert not line_contains_openacc_directive(cxx_code, None) +def test_line_contains_openmp_directive(): + cxx_code = "int main(void) {\n#pragma omp target}" + f90_code = "!$omp target" + assert line_contains_openmp_directive(cxx_code, Cxx()) + assert not line_contains_openmp_directive(f90_code, Cxx()) + assert line_contains_openmp_directive(f90_code, Fortran()) + assert not line_contains_openmp_directive(cxx_code, Fortran()) + assert 
not line_contains_openmp_directive(cxx_code, None) + + def test_line_contains_openacc_parallel_directive(): assert line_contains_openacc_parallel_directive("#pragma acc parallel wait", Cxx()) assert line_contains_openacc_parallel_directive("!$acc parallel", Fortran()) @@ -36,11 +53,24 @@ def test_line_contains_openacc_parallel_directive(): assert not line_contains_openacc_parallel_directive("!$acc parallel", None) +def test_line_contains_openmp_target_directive(): + assert line_contains_openmp_target_directive("#pragma omp target teams", Cxx()) + assert line_contains_openmp_target_directive("!$omp target", Fortran()) + assert not line_contains_openmp_target_directive("#pragma omp loop", Cxx()) + assert not line_contains_openmp_target_directive("!$omp loop", Fortran()) + assert not line_contains_openmp_target_directive("!$omp parallel", None) + + def test_openacc_directive_contains_data_clause(): assert openacc_directive_contains_data_clause("#pragma acc parallel present(A[:1089])") assert not openacc_directive_contains_data_clause("#pragma acc parallel for") +def test_openmp_directive_contains_data_clause(): + assert openmp_directive_contains_data_clause("#pragma omp target teams map(tofrom: A[:1089])") + assert not openmp_directive_contains_data_clause("#pragma omp target") + + def test_create_data_directive(): size = ArraySize() size.add(1024) @@ -48,6 +78,7 @@ def test_create_data_directive(): create_data_directive_openacc("array", size, Cxx()) == "#pragma acc enter data create(array[:1024])\n#pragma acc update device(array[:1024])\n" ) + assert create_data_directive_openmp("array", size, Cxx()) == "#pragma omp target enter data map(to: array[:1024])\n" size.clear() size.add(35) size.add(16) @@ -55,18 +86,27 @@ def test_create_data_directive(): create_data_directive_openacc("matrix", size, Fortran()) == "!$acc enter data create(matrix(:35,:16))\n!$acc update device(matrix(:35,:16))\n" ) + assert ( + create_data_directive_openmp("matrix", size, Fortran()) == "!$omp target enter data map(to: matrix(:35,:16))\n" + ) assert create_data_directive_openacc("array", size, None) == "" + assert create_data_directive_openmp("array", size, None) == "" def test_exit_data_directive(): size = ArraySize() size.add(1024) assert exit_data_directive_openacc("array", size, Cxx()) == "#pragma acc exit data copyout(array[:1024])\n" + assert exit_data_directive_openmp("array", size, Cxx()) == "#pragma omp target exit data map(from: array[:1024])\n" size.clear() size.add(35) size.add(16) assert exit_data_directive_openacc("matrix", size, Fortran()) == "!$acc exit data copyout(matrix(:35,:16))\n" + assert ( + exit_data_directive_openmp("matrix", size, Fortran()) == "!$omp target exit data map(from: matrix(:35,:16))\n" + ) assert exit_data_directive_openacc("matrix", size, None) == "" + assert exit_data_directive_openmp("matrix", size, None) == "" def test_correct_kernel(): @@ -101,14 +141,20 @@ def test_wrap_timing(): def test_wrap_data(): acc_cxx = Code(OpenACC(), Cxx()) acc_f90 = Code(OpenACC(), Fortran()) + omp_cxx = Code(OpenMP(), Cxx()) + omp_f90 = Code(OpenMP(), Fortran()) code_cxx = "// this is a comment\n" code_f90 = "! 
this is a comment\n" data = {"array": ["int*", "size"]} preprocessor = ["#define size 42"] expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n\n// this is a comment\n\n#pragma acc exit data copyout(array[:42])\n" assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx + expected_cxx = "#pragma omp target enter data map(to: array[:42])\n\n// this is a comment\n\n#pragma omp target exit data map(from: array[:42])\n" + assert wrap_data(code_cxx, omp_cxx, data, preprocessor, None) == expected_cxx expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n\n! this is a comment\n\n!$acc exit data copyout(array(:42))\n" assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90 + expected_f90 = "!$omp target enter data map(to: array(:42))\n\n! this is a comment\n\n!$omp target exit data map(from: array(:42))\n" + assert wrap_data(code_f90, omp_f90, data, preprocessor, None) == expected_f90 data = {"matrix": ["float*", "rows,cols"]} preprocessor = ["#define rows 42", "#define cols 84"] expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n\n! this is a comment\n\n!$acc exit data copyout(matrix(:42,:84))\n" @@ -238,13 +284,13 @@ def test_extract_directive_signature(): signatures = extract_directive_signature(code, acc_cxx) assert len(signatures) == 1 assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + "float vector_add(float * a, float * b, float * c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, acc_cxx, "vector_add") assert len(signatures) == 1 assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + "float vector_add(float * a, float * b, float * c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, acc_cxx, "vector_add_ext")
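
A minimal usage sketch of the relocated host-memory refresh introduced above: ready_argument_list in the Compiler backend now keeps a clean copy of every NumPy argument in cfunc.allocations, and refresh_memory restores those copies instead of relying on memcpy_htod. The snippet mirrors the new test_refresh_memory and assumes a working host compiler is available for CompilerFunctions(); it is an illustration, not an addition to the patch itself.

import numpy as np
from kernel_tuner.backends.compiler import CompilerFunctions

# ready_argument_list stores a clean copy of each NumPy argument in cfunc.allocations
arg = np.array([1, 2, 3], dtype=np.int32)
arguments = [arg]
cfunc = CompilerFunctions()
cfunc.ready_argument_list(arguments)

# simulate a kernel run overwriting the output buffer
arguments[0][:] = 0

# refresh_memory copies the preserved contents back into the host arrays
cfunc.refresh_memory(arguments, should_sync=[True])
assert np.all(arguments[0] == [1, 2, 3])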
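
Likewise, a small sketch of the new OpenMP helpers in kernel_tuner.utils.directives, following the expectations encoded in test_create_data_directive and test_wrap_data; the ArraySize usage and the exact directive strings are taken from those tests, and the printed comments paraphrase their asserts.

from kernel_tuner.utils.directives import (
    ArraySize, Code, Cxx, OpenMP,
    create_data_directive_openmp, exit_data_directive_openmp, wrap_data,
)

# C++ OpenMP enter/exit data directives for a 1024-element array
size = ArraySize()
size.add(1024)
print(create_data_directive_openmp("array", size, Cxx()))  # #pragma omp target enter data map(to: array[:1024])
print(exit_data_directive_openmp("array", size, Cxx()))    # #pragma omp target exit data map(from: array[:1024])

# wrap_data dispatches on the (directive, language) pair stored in a Code object,
# resolving the array size ("size" -> 42) through the given preprocessor definitions
omp_cxx = Code(OpenMP(), Cxx())
wrapped = wrap_data("// kernel body\n", omp_cxx, {"array": ["int*", "size"]}, ["#define size 42"], None)
print(wrapped)  # enter data directive, kernel body, exit data directive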