From bdaddae45749fc19825ad73797c7d5e6ea5fb92d Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Sun, 11 Feb 2024 22:17:59 -0800
Subject: [PATCH 01/17] add dummy dlpack api

---
 dlpack/LICENSE                         | 201 +++++++++++++++++++++
 dlpack/dlpack.h                        | 233 +++++++++++++++++++++++++
 simtbx/diffBragg/src/diffBragg_ext.cpp | 209 ++++++++++++++++++++++
 3 files changed, 643 insertions(+)
 create mode 100644 dlpack/LICENSE
 create mode 100644 dlpack/dlpack.h

diff --git a/dlpack/LICENSE b/dlpack/LICENSE
new file mode 100644
index 0000000000..20a9c8a7b4
--- /dev/null
+++ b/dlpack/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2017 by Contributors
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/dlpack/dlpack.h b/dlpack/dlpack.h
new file mode 100644
index 0000000000..ef6960b23a
--- /dev/null
+++ b/dlpack/dlpack.h
@@ -0,0 +1,233 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 70
+
+/*! \brief The current ABI version of dlpack */
+#define DLPACK_ABI_VERSION 1
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly test extension device
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partititioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+// NB: This is the only difference from
+// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to
+// allow forward declaration of DLDevice.
+typedef struct DLDevice_ {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be
+   * well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow
+ * the native endian-ness. An explicit error message should be raised when
+ * attempting to export an array with non-native endianness
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes=1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+ *   - int8: type_code = 0, bits = 8, lanes=1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer*/
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating tensor is compact and row-majored.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void* manager_ctx;
+  /*! \brief Destructor signature void (*)(void*) - this should be called
+   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+   *   if there is no way for the caller to provide a reasonable destructor.
+   *   The destructors deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor* self);
+} DLManagedTensor;
+#ifdef __cplusplus
+} // DLPACK_EXTERN_C
+#endif
+#endif // DLPACK_DLPACK_H_
diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index 47d500ca77..35eb280737 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -4,6 +4,7 @@
 #include <simtbx/nanoBragg/nanoBragg.h>
 #include <iostream>
 #include <boost/python/numpy.hpp>
+#include <dlpack/dlpack.h>
 
 using namespace boost::python;
 namespace simtbx{
@@ -512,6 +513,203 @@ namespace boost_python { namespace {
     Kokkos::initialize(Kokkos::InitializationSettings()
                            .set_device_id(dev));
   }
+
+  // Print ndim, shape, dtype, device, strides and byte offset of the DLPack
+  // tensor held by a PyCapsule named "dltensor". If `capsule` is not such a
+  // capsule, the Python error set by PyCapsule_GetPointer is propagated.
+  void PrintDLTensorParameters( PyObject* capsule ) {
+    // A "dltensor" capsule carries a DLManagedTensor, not a bare DLTensor.
+    auto managed = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
+    if (managed == nullptr) {
+        // Returning normally here would hand Python a result while the
+        // exception raised by PyCapsule_GetPointer is still pending.
+        boost::python::throw_error_already_set();
+    }
+    const DLTensor* tensor = &managed->dl_tensor;
+
+    // Print the number of dimensions.
+    std::cout << "Number of dimensions (ndim): " << tensor->ndim << std::endl;
+
+    // Print the shape of the tensor.
+    std::cout << "Shape: [";
+    for (int i = 0; i < tensor->ndim; ++i) {
+        std::cout << (i > 0 ? ", " : "") << tensor->shape[i];
+    }
+    std::cout << "]" << std::endl;
+
+    // Print the data type of the tensor.
+    std::cout << "Data type (dtype): ";
+    switch (tensor->dtype.code) {
+        case kDLInt: std::cout << "Int"; break;
+        case kDLUInt: std::cout << "UInt"; break;
+        case kDLFloat: std::cout << "Float"; break;
+        case kDLBfloat: std::cout << "BFloat"; break;
+        default: std::cout << "Unknown"; break;
+    }
+    std::cout << " (" << int(tensor->dtype.bits) << " bits, " << tensor->dtype.lanes << " lanes)" << std::endl;
+
+    // Print the device context (device type and device id).
+    std::cout << "Device context: ";
+    switch (tensor->device.device_type) {
+        case kDLCPU: std::cout << "CPU"; break;
+        case kDLCUDA: std::cout << "CUDA"; break;
+        case kDLCUDAHost: std::cout << "CUDA Host"; break;
+        case kDLOpenCL: std::cout << "OpenCL"; break;
+        case kDLVulkan: std::cout << "Vulkan"; break;
+        case kDLMetal: std::cout << "Metal"; break;
+        case kDLVPI: std::cout << "VPI"; break;
+        case kDLROCM: std::cout << "ROCM"; break;
+        default: std::cout << "Unknown"; break;
+    }
+    std::cout << ", Device ID: " << tensor->device.device_id << std::endl;
+
+    // Print the strides of the tensor, if available.
+    if (tensor->strides != nullptr) {
+        std::cout << "Strides: [";
+        for (int i = 0; i < tensor->ndim; ++i) {
+            std::cout << (i > 0 ? ", " : "") << tensor->strides[i];
+        }
+        std::cout << "]" << std::endl;
+    } else {
+        std::cout << "Strides: [Contiguous in memory]" << std::endl;
+    }
+
+    // Print the byte offset of the tensor data.
+    std::cout << "Byte Offset: " << tensor->byte_offset << std::endl;
+}
+
+  template <typename ViewType>
+  class KokkosViewToDLPack {  // stores a copy of a view and describes it as a DLPack tensor
+  public:
+    KokkosViewToDLPack(ViewType view) : view_(view) {}
+    // NOTE(review): this overload and the private one below differ only in return type, which C++ rejects.
+    torch::Tensor convertToDLPack() {
+      // Build the DLManagedTensor for view_ (presumably meant to call the private helper below).
+      DLManagedTensor* dlpackTensor = convertToDLPack();
+
+      // Hand the DLPack tensor over to PyTorch.
+      torch::Tensor tensor = torch::from_dlpack(dlpackTensor);
+
+      // NOTE(review): if from_dlpack takes ownership and runs the deleter later, these deletes free twice - TODO confirm.
+      delete[] dlpackTensor->dl_tensor.shape;
+      delete dlpackTensor;
+
+      return tensor;
+    }
+
+  private:
+    ViewType view_;
+    // Returns a DLManagedTensor allocated with new (plus a new[] shape array) that points at view_.data().
+    DLManagedTensor* convertToDLPack() {
+      // Copy the view extents into a heap array. NOTE(review): size_t* here, but DLTensor::shape is int64_t*.
+      size_t numDims = ViewType::rank;
+      size_t* shape = new size_t[numDims];
+      for (size_t i = 0; i < numDims; i++) {
+        shape[i] = view_.extent(i);
+      }
+
+      // Fill in the tensor description. NOTE(review): DLTensor in dlpack/dlpack.h has `device`, not `ctx`.
+      DLManagedTensor* dlpackTensor = new DLManagedTensor;
+      dlpackTensor->dl_tensor.data = view_.data();
+      dlpackTensor->dl_tensor.ctx = const_cast<void*>(view_.impl_map().template device_data<void>());
+      dlpackTensor->dl_tensor.ndim = numDims;
+      dlpackTensor->dl_tensor.dtype = getDLPackDataType();
+      dlpackTensor->dl_tensor.shape = shape;
+      dlpackTensor->dl_tensor.strides = nullptr;
+      dlpackTensor->dl_tensor.byte_offset = 0;
+      dlpackTensor->manager_ctx = nullptr;
+      dlpackTensor->deleter = [](DLManagedTensor* tensor) { delete[] tensor->dl_tensor.shape; };
+      // NOTE(review): the deleter above releases only the shape array, never the DLManagedTensor itself.
+      return dlpackTensor;
+    }
+    // Element type description: code from getDLPackTypeCode(), width of value_type in bits, one lane.
+    DLDataType getDLPackDataType() {
+      DLDataType dtype;
+      dtype.code = getDLPackTypeCode();
+      dtype.bits = sizeof(typename ViewType::value_type) * 8;
+      dtype.lanes = 1;
+      return dtype;
+    }
+    // Maps ViewType::value_type to a DLPack type code; throws std::runtime_error for any unlisted type.
+    DLDataTypeCode getDLPackTypeCode() {
+      using ValueType = typename ViewType::value_type;
+      if (std::is_same<ValueType, float>::value) {
+        return kDLFloat;
+      } else if (std::is_same<ValueType, double>::value) {
+        return kDLFloat;
+      } else if (std::is_same<ValueType, int>::value) {
+        return kDLInt;
+      } else if (std::is_same<ValueType, unsigned int>::value) {
+        return kDLUInt;
+      } else if (std::is_same<ValueType, bool>::value) {
+        return kDLBool;  // NOTE(review): kDLBool is not declared in the vendored dlpack/dlpack.h
+      } else {
+        // No DLPack type code is assigned to this element type.
+        throw std::runtime_error("Unsupported data type for DLPack conversion");
+      }
+    }
+  };
+
+
+// Minimal producer for the Python DLPack protocol: exports the 50 doubles
+// stored in the object as a 1-D CPU tensor of doubles without copying them.
+struct DLPackAPI {
+  static constexpr int kLength = 50;
+  double container[kLength] = {};  // zero-filled so print_container never reads indeterminate values
+
+  // __dlpack__: returns a new PyCapsule named "dltensor" that owns a
+  // DLManagedTensor pointing at `container`. The tensor does not keep this
+  // object alive, so the object must outlive every importer of the capsule.
+  PyObject* dlpack() {
+    DLManagedTensor* dlpackTensor = new DLManagedTensor;
+    dlpackTensor->dl_tensor.data = static_cast<void*>(&container);
+    dlpackTensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU;
+    dlpackTensor->dl_tensor.device.device_id = 0;
+    dlpackTensor->dl_tensor.ndim = 1;
+    dlpackTensor->dl_tensor.dtype = getDLPackDataType();
+    dlpackTensor->dl_tensor.shape = new int64_t[1]{kLength};
+    dlpackTensor->dl_tensor.strides = nullptr;  // null strides: compact, row-major
+    dlpackTensor->dl_tensor.byte_offset = 0;
+    dlpackTensor->manager_ctx = nullptr;
+    // Per dlpack.h the deleter must also destroy the DLManagedTensor itself.
+    dlpackTensor->deleter = [](DLManagedTensor* tensor) {
+        std::cout << "Blob" << std::endl;
+        delete[] tensor->dl_tensor.shape;
+        delete tensor;
+    };
+
+    // Importers rename the capsule when they take ownership; a capsule that is
+    // destroyed while still named "dltensor" was never consumed and has to
+    // release the tensor itself.
+    return PyCapsule_New(dlpackTensor, "dltensor", [](PyObject* capsule) {
+        if (!PyCapsule_IsValid(capsule, "dltensor")) return;
+        auto* unused = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
+        unused->deleter(unused);
+    });
+  }
+
+  // Element type of `container`: kDLFloat, width of double in bits, one lane.
+  DLDataType getDLPackDataType() {
+    return DLDataType{kDLFloat, sizeof(double) * 8, 1};
+  }
+
+  // Dummy method to check that the binding is reachable from Python.
+  void print_hello() { std::cout << "Hello Python!" << std::endl; }
+
+  // Print every element of `container` on one line.
+  void print_container() {
+    std::cout << "C = [ ";
+    for (int i=0; i<kLength; i++) {
+      std::cout << container[i] << " ";
+    }
+    std::cout << "]" << std::endl;
+  }
+
+  // __dlpack_device__: (device_type, device_id) of the exported memory, i.e. CPU 0.
+  boost::python::tuple dlpack_device() {
+    return boost::python::make_tuple(static_cast<int32_t>(DLDeviceType::kDLCPU), 0);
+  }
+  };
 #endif
 
   void diffBragg_init_module() {
@@ -529,8 +727,19 @@ namespace boost_python { namespace {
 
     def("initialize_kokkos", initialize_kokkos,
         "the sole argument `dev` (an int from 0 to Ngpu-1) is passed to Kokkos::initialize()");
+
+    def("print_dlpack",PrintDLTensorParameters,"Print information about a dlpack");
+
+    class_<DLPackAPI>("KokkosViewToDLPack", no_init)
+      .def(init("DLPack init"))
+      .def("__dlpack__", &DLPackAPI::dlpack, "Part of DLPack API")
+      .def("__dlpack_device__", &DLPackAPI::dlpack_device, "Part of DLPack API")
+      .def("hello", &DLPackAPI::print_hello, "Dummy test function")
+      .def("print", &DLPackAPI::print_container, "Print container")
+    ;
 #endif
 
+
     class_<simtbx::nanoBragg::diffBragg, bases<simtbx::nanoBragg::nanoBragg> >
             ("diffBragg", no_init)
       /* constructor that takes a dxtbx detector and beam model */

From ffc0ce1987f9a19dc0fae13c1bed2351d6d7b81c Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Mon, 12 Feb 2024 15:19:11 -0800
Subject: [PATCH 02/17] DLPack interface trial for diffBragg

---
 kokkostbx/kokkos_dlpack.h                | 167 +++++++++++++++++++++++
 simtbx/diffBragg/src/diffBragg.cpp       |  10 ++
 simtbx/diffBragg/src/diffBragg.h         |   4 +-
 simtbx/diffBragg/src/diffBraggKOKKOS.cpp |   4 +
 simtbx/diffBragg/src/diffBraggKOKKOS.h   |   3 +
 simtbx/diffBragg/src/diffBragg_ext.cpp   | 137 ++++++++++---------
 6 files changed, 257 insertions(+), 68 deletions(-)
 create mode 100644 kokkostbx/kokkos_dlpack.h

diff --git a/kokkostbx/kokkos_dlpack.h b/kokkostbx/kokkos_dlpack.h
new file mode 100644
index 0000000000..1f47a31220
--- /dev/null
+++ b/kokkostbx/kokkos_dlpack.h
@@ -0,0 +1,167 @@
+#ifndef KOKKOS_DLPACK_H
+#define KOKKOS_DLPACK_H
+#include <Kokkos_Core.hpp>
+#include <dlpack/dlpack.h>
+
+namespace kokkostbx {
+
+template<typename DataType, typename SpaceType>
+DLDataTypeCode getDLPackTypeCode() {
+  // Map the view's element type to a DLPack type code; the element width is
+  // reported separately through DLDataType::bits. cv-qualifiers are stripped
+  // first so that views of const data can be exported as well.
+  // Throws std::runtime_error for element types without a DLPack code.
+  using ValueType = typename std::remove_cv<
+      typename Kokkos::View<DataType, SpaceType>::value_type>::type;
+  if (std::is_same<ValueType, float>::value || std::is_same<ValueType, double>::value) {
+    return kDLFloat;
+  } else if (std::is_integral<ValueType>::value && !std::is_same<ValueType, bool>::value) {
+    // Any integer width (int, long, int64_t, ...); bool has no code in this dlpack.h.
+    return std::is_signed<ValueType>::value ? kDLInt : kDLUInt;
+  } else {
+    // Unsupported data type
+    throw std::runtime_error("Unsupported data type for DLPack conversion");
+  }
+}
+
+template<typename DataType, typename SpaceType>
+DLDataType getDLPackDataType() {
+  // Scalar elements only: one lane, width taken from the view's value type.
+  using Element = typename Kokkos::View<DataType, SpaceType>::value_type;
+  const auto code = static_cast<uint8_t>(getDLPackTypeCode<DataType, SpaceType>());
+  const auto bits = static_cast<uint8_t>(sizeof(Element) * 8);
+  return DLDataType{code, bits, 1};
+}
+
+template<typename SpaceType>
+DLDevice getDLPackDevice() {
+  // Translate a Kokkos memory space into DLPack's {device_type, device_id}.
+  // dlpack.h asks for id 0 on plain host, pinned and managed memory. Device
+  // memory reports the device Kokkos is actually running on, so tensors are
+  // attributed correctly when Kokkos was initialized on a device other than 0.
+  // Throws std::runtime_error for any other memory space.
+  if (std::is_same<SpaceType, Kokkos::HostSpace>::value) return {kDLCPU, 0};
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<SpaceType, Kokkos::CudaSpace>::value) {
+    return {kDLCUDA, Kokkos::Cuda().cuda_device()};
+  }
+  if (std::is_same<SpaceType, Kokkos::CudaUVMSpace>::value) {
+    return {kDLCUDAManaged, 0};
+  }
+  if (std::is_same<SpaceType, Kokkos::CudaHostPinnedSpace>::value) {
+    return {kDLCUDAHost, 0};
+  }
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+  if (std::is_same<SpaceType, Kokkos::HIPSpace>::value) {
+    return {kDLROCM, Kokkos::HIP().hip_device()};
+  }
+  if (std::is_same<SpaceType, Kokkos::HIPHostPinnedSpace>::value) return {kDLROCMHost, 0};
+#endif
+  // Extend to other device types as needed
+  throw std::runtime_error("Unsupported Kokkos device type for DLPack conversion.");
+}
+
+template<typename DataType, typename SpaceType>
+DLManagedTensor* view_to_dlpack(Kokkos::View<DataType, SpaceType>& view) {
+  // Describe `view` as a new DLManagedTensor without copying its data. The tensor
+  // does not keep the allocation alive: the view must outlive every importer.
+  constexpr size_t numDims = Kokkos::View<DataType, SpaceType>::rank;
+  // One array holds [extents..., strides...] so a single delete[] releases both.
+  int64_t* dims = new int64_t[2 * numDims];
+  for (size_t i = 0; i < numDims; i++) {
+    dims[i] = view.extent(i);
+    dims[numDims + i] = view.stride(i);
+  }
+  DLManagedTensor* dlpackTensor = new DLManagedTensor;
+  dlpackTensor->dl_tensor.data = view.data();
+  dlpackTensor->dl_tensor.device = getDLPackDevice<SpaceType>();
+  dlpackTensor->dl_tensor.ndim = numDims;
+  dlpackTensor->dl_tensor.dtype = getDLPackDataType<DataType, SpaceType>();
+  dlpackTensor->dl_tensor.shape = dims;
+  // Strides in elements; null would claim a compact row-major layout (wrong for LayoutLeft).
+  dlpackTensor->dl_tensor.strides = dims + numDims;
+  dlpackTensor->dl_tensor.byte_offset = 0;
+  dlpackTensor->manager_ctx = nullptr;
+  dlpackTensor->deleter = [](DLManagedTensor* tensor) {
+      delete[] tensor->dl_tensor.shape;  // also releases the strides stored behind the extents
+      delete tensor;  // dlpack.h: the deleter destroys `self` as well
+  };
+  return dlpackTensor;
+}
+
+// template <typename ViewType>
+// class KokkosViewToDLPack {
+// public:
+//   KokkosViewToDLPack(ViewType view) : view_(view) {}
+
+//   torch::Tensor convertToDLPack() {
+//     // Convert the Kokkos view to DLPack
+//     DLManagedTensor* dlpackTensor = convertToDLPack();
+
+//     // Convert the DLPack tensor to PyTorch
+//     torch::Tensor tensor = torch::from_dlpack(dlpackTensor);
+
+//     // Free the DLPack tensor memory
+//     delete[] dlpackTensor->dl_tensor.shape;
+//     delete dlpackTensor;
+
+//     return tensor;
+//   }
+  
+// private:
+//   ViewType view_;
+
+//   DLManagedTensor* convertToDLPack() {
+//     // Get the Kokkos view size and dimensions
+//     size_t numDims = ViewType::rank;
+//     size_t* shape = new size_t[numDims];
+//     for (size_t i = 0; i < numDims; i++) {
+//       shape[i] = view_.extent(i);
+//     }
+
+//     // Create a DLPack tensor
+//     DLManagedTensor* dlpackTensor = new DLManagedTensor;
+//     dlpackTensor->dl_tensor.data = view_.data();
+//     dlpackTensor->dl_tensor.ctx = const_cast<void*>(view_.impl_map().template device_data<void>());
+//     dlpackTensor->dl_tensor.ndim = numDims;
+//     dlpackTensor->dl_tensor.dtype = getDLPackDataType();
+//     dlpackTensor->dl_tensor.shape = shape;
+//     dlpackTensor->dl_tensor.strides = nullptr;
+//     dlpackTensor->dl_tensor.byte_offset = 0;
+//     dlpackTensor->manager_ctx = nullptr;
+//     dlpackTensor->deleter = [](DLManagedTensor* tensor) { delete[] tensor->dl_tensor.shape; };
+
+//     return dlpackTensor;
+//   }
+
+//   DLDataType getDLPackDataType() {
+//     DLDataType dtype;
+//     dtype.code = getDLPackTypeCode();
+//     dtype.bits = sizeof(typename ViewType::value_type) * 8;
+//     dtype.lanes = 1;
+//     return dtype;
+//   }
+
+//   DLDataTypeCode getDLPackTypeCode() {
+//     using ValueType = typename ViewType::value_type;
+//     if (std::is_same<ValueType, float>::value) {
+//       return kDLFloat;
+//     } else if (std::is_same<ValueType, double>::value) {
+//       return kDLFloat;
+//     } else if (std::is_same<ValueType, int>::value) {
+//       return kDLInt;
+//     } else if (std::is_same<ValueType, unsigned int>::value) {
+//       return kDLUInt;
+//     } else if (std::is_same<ValueType, bool>::value) {
+//       return kDLBool;
+//     } else {
+//       // Unsupported data type
+//       throw std::runtime_error("Unsupported data type for DLPack conversion");
+//     }
+//   }
+// };
+
+}
+
+#endif  // KOKKOS_DLPACK_H
diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index a42cc5b960..51988d2bdc 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1507,6 +1507,16 @@ boost::python::tuple diffBragg::get_ncells_derivative_pixels(){
     return derivative_pixels;
 }
 
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+PyObject* diffBragg::get_d_Ncells_images() {
+
+    if (diffBragg_runner == nullptr) {
+        return nullptr;
+    }
+    return PyCapsule_New(diffBragg_runner->get_d_Ncells_images(), "dltensor", nullptr);   
+}
+#endif
+
 boost::python::tuple diffBragg::get_diffuse_gamma_derivative_pixels(){
     SCITBX_ASSERT(db_flags.refine_diffuse);
     int Npix_total = first_deriv_imgs.diffuse_gamma.size() / 3;
diff --git a/simtbx/diffBragg/src/diffBragg.h b/simtbx/diffBragg/src/diffBragg.h
index c26d0d019e..c27567ca03 100644
--- a/simtbx/diffBragg/src/diffBragg.h
+++ b/simtbx/diffBragg/src/diffBragg.h
@@ -170,7 +170,6 @@ class diffBragg: public nanoBragg{
     inline void kokkos_free() { diffBragg_runner.reset(); }
     // allocate when needed to avoid problems with kokkos initialization when cuda/kokkos isn't used
     std::shared_ptr<diffBraggKOKKOS> diffBragg_runner{};
-    // diffBraggKOKKOS diffBragg_runner;
 #endif
 
     inline void gpu_free(){
@@ -238,6 +237,9 @@ class diffBragg: public nanoBragg{
   af::flex_double get_raw_pixels_roi();
   boost::python::tuple get_fp_fdp_derivative_pixels();
   boost::python::tuple get_ncells_derivative_pixels();
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+  PyObject* get_d_Ncells_images();
+#endif
   boost::python::tuple get_diffuse_gamma_derivative_pixels();
   boost::python::tuple get_diffuse_sigma_derivative_pixels();
   boost::python::tuple get_ncells_def_derivative_pixels();
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
index 273088f765..3420469ed8 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
@@ -43,6 +43,10 @@ uint32_t combine_refinement_flags(flags& db_flags) {
     return refine_flag;
 }
 
+DLManagedTensor* diffBraggKOKKOS::get_d_Ncells_images() {
+    return kokkostbx::view_to_dlpack(m_d_Ncells_images);
+}
+
 void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
     int Npix_to_model,
     std::vector<unsigned int>& panels_fasts_slows,
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.h b/simtbx/diffBragg/src/diffBraggKOKKOS.h
index 3aa07ae27b..a88ef5d4d3 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.h
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.h
@@ -7,6 +7,7 @@
 
 #include "kokkostbx/kokkos_types.h"
 #include "kokkostbx/kokkos_utils.h"
+#include "kokkostbx/kokkos_dlpack.h"
 #include "simtbx/diffBragg/src/util.h"
 #include "simtbx/diffBragg/src/util_kokkos.h"
 #include "simtbx/diffBragg/src/diffBragg_refine_flag.h"
@@ -147,6 +148,8 @@ class diffBraggKOKKOS {
         cuda_flags& db_cu_flags,
         // diffBragg_kokkosPointers& kp,
         timer_variables& TIMERS);
+
+    DLManagedTensor* get_d_Ncells_images();
 };
 
 #endif
diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index 35eb280737..43906ce078 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -578,77 +578,77 @@ namespace boost_python { namespace {
     std::cout << "Byte Offset: " << tensor->byte_offset << std::endl;
 }
 
-  template <typename ViewType>
-  class KokkosViewToDLPack {
-  public:
-    KokkosViewToDLPack(ViewType view) : view_(view) {}
+  // template <typename ViewType>
+  // class KokkosViewToDLPack {
+  // public:
+  //   KokkosViewToDLPack(ViewType view) : view_(view) {}
 
-    torch::Tensor convertToDLPack() {
-      // Convert the Kokkos view to DLPack
-      DLManagedTensor* dlpackTensor = convertToDLPack();
+  //   torch::Tensor convertToDLPack() {
+  //     // Convert the Kokkos view to DLPack
+  //     DLManagedTensor* dlpackTensor = convertToDLPack();
 
-      // Convert the DLPack tensor to PyTorch
-      torch::Tensor tensor = torch::from_dlpack(dlpackTensor);
+  //     // Convert the DLPack tensor to PyTorch
+  //     torch::Tensor tensor = torch::from_dlpack(dlpackTensor);
 
-      // Free the DLPack tensor memory
-      delete[] dlpackTensor->dl_tensor.shape;
-      delete dlpackTensor;
+  //     // Free the DLPack tensor memory
+  //     delete[] dlpackTensor->dl_tensor.shape;
+  //     delete dlpackTensor;
 
-      return tensor;
-    }
+  //     return tensor;
+  //   }
     
-  private:
-    ViewType view_;
-
-    DLManagedTensor* convertToDLPack() {
-      // Get the Kokkos view size and dimensions
-      size_t numDims = ViewType::rank;
-      size_t* shape = new size_t[numDims];
-      for (size_t i = 0; i < numDims; i++) {
-        shape[i] = view_.extent(i);
-      }
-
-      // Create a DLPack tensor
-      DLManagedTensor* dlpackTensor = new DLManagedTensor;
-      dlpackTensor->dl_tensor.data = view_.data();
-      dlpackTensor->dl_tensor.ctx = const_cast<void*>(view_.impl_map().template device_data<void>());
-      dlpackTensor->dl_tensor.ndim = numDims;
-      dlpackTensor->dl_tensor.dtype = getDLPackDataType();
-      dlpackTensor->dl_tensor.shape = shape;
-      dlpackTensor->dl_tensor.strides = nullptr;
-      dlpackTensor->dl_tensor.byte_offset = 0;
-      dlpackTensor->manager_ctx = nullptr;
-      dlpackTensor->deleter = [](DLManagedTensor* tensor) { delete[] tensor->dl_tensor.shape; };
-
-      return dlpackTensor;
-    }
-
-    DLDataType getDLPackDataType() {
-      DLDataType dtype;
-      dtype.code = getDLPackTypeCode();
-      dtype.bits = sizeof(typename ViewType::value_type) * 8;
-      dtype.lanes = 1;
-      return dtype;
-    }
-
-    DLDataTypeCode getDLPackTypeCode() {
-      using ValueType = typename ViewType::value_type;
-      if (std::is_same<ValueType, float>::value) {
-        return kDLFloat;
-      } else if (std::is_same<ValueType, double>::value) {
-        return kDLFloat;
-      } else if (std::is_same<ValueType, int>::value) {
-        return kDLInt;
-      } else if (std::is_same<ValueType, unsigned int>::value) {
-        return kDLUInt;
-      } else if (std::is_same<ValueType, bool>::value) {
-        return kDLBool;
-      } else {
-        // Unsupported data type
-        throw std::runtime_error("Unsupported data type for DLPack conversion");
-      }
-    }
-  };
+  // private:
+  //   ViewType view_;
+
+  //   DLManagedTensor* convertToDLPack() {
+  //     // Get the Kokkos view size and dimensions
+  //     size_t numDims = ViewType::rank;
+  //     size_t* shape = new size_t[numDims];
+  //     for (size_t i = 0; i < numDims; i++) {
+  //       shape[i] = view_.extent(i);
+  //     }
+
+  //     // Create a DLPack tensor
+  //     DLManagedTensor* dlpackTensor = new DLManagedTensor;
+  //     dlpackTensor->dl_tensor.data = view_.data();
+  //     dlpackTensor->dl_tensor.ctx = const_cast<void*>(view_.impl_map().template device_data<void>());
+  //     dlpackTensor->dl_tensor.ndim = numDims;
+  //     dlpackTensor->dl_tensor.dtype = getDLPackDataType();
+  //     dlpackTensor->dl_tensor.shape = shape;
+  //     dlpackTensor->dl_tensor.strides = nullptr;
+  //     dlpackTensor->dl_tensor.byte_offset = 0;
+  //     dlpackTensor->manager_ctx = nullptr;
+  //     dlpackTensor->deleter = [](DLManagedTensor* tensor) { delete[] tensor->dl_tensor.shape; };
+
+  //     return dlpackTensor;
+  //   }
+
+  //   DLDataType getDLPackDataType() {
+  //     DLDataType dtype;
+  //     dtype.code = getDLPackTypeCode();
+  //     dtype.bits = sizeof(typename ViewType::value_type) * 8;
+  //     dtype.lanes = 1;
+  //     return dtype;
+  //   }
+
+  //   DLDataTypeCode getDLPackTypeCode() {
+  //     using ValueType = typename ViewType::value_type;
+  //     if (std::is_same<ValueType, float>::value) {
+  //       return kDLFloat;
+  //     } else if (std::is_same<ValueType, double>::value) {
+  //       return kDLFloat;
+  //     } else if (std::is_same<ValueType, int>::value) {
+  //       return kDLInt;
+  //     } else if (std::is_same<ValueType, unsigned int>::value) {
+  //       return kDLUInt;
+  //     } else if (std::is_same<ValueType, bool>::value) {
+  //       return kDLBool;
+  //     } else {
+  //       // Unsupported data type
+  //       throw std::runtime_error("Unsupported data type for DLPack conversion");
+  //     }
+  //   }
+  // };
 
 
 struct DLPackAPI {
@@ -710,6 +710,7 @@ struct DLPackAPI {
   }
 
   };
+
 #endif
 
   void diffBragg_init_module() {
@@ -730,6 +731,8 @@ struct DLPackAPI {
 
     def("print_dlpack",PrintDLTensorParameters,"Print information about a dlpack");
 
+    // def("get_d_Ncells_images", &get_dlpack, "Return DLPackTensor for d_Ncells_images; pot. on GPU")
+
     class_<DLPackAPI>("KokkosViewToDLPack", no_init)
       .def(init("DLPack init"))
       .def("__dlpack__", &DLPackAPI::dlpack, "Part of DLPack API")
@@ -787,7 +790,7 @@ struct DLPackAPI {
       .def("set_ncells_values", &simtbx::nanoBragg::diffBragg::set_ncells_values, "set Ncells values as a 3-tuple (Na, Nb, Nc)")
 
       .def("get_ncells_values", &simtbx::nanoBragg::diffBragg::get_ncells_values, "get Ncells values as a 3-tuple (Na, Nb, Nc)")
-
+      .def("get_d_Ncells_images", &simtbx::nanoBragg::diffBragg::get_d_Ncells_images, "get DLPackTensor for d_Ncells_images; pot. on GPU")
 
       .def("add_diffBragg_spots_full", &simtbx::nanoBragg::diffBragg::add_diffBragg_spots_full, "forward model and gradients at every pixel")
 

From 59f5d5d78e9d629e0fdfa1d7ebb7ffb9c685b0e6 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Wed, 14 Feb 2024 11:19:27 -0800
Subject: [PATCH 03/17] Fix missing destructor

---
 simtbx/diffBragg/src/diffBragg_ext.cpp | 87 +++++---------------------
 1 file changed, 15 insertions(+), 72 deletions(-)

diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index 43906ce078..1245730aa7 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -578,78 +578,21 @@ namespace boost_python { namespace {
     std::cout << "Byte Offset: " << tensor->byte_offset << std::endl;
 }
 
-  // template <typename ViewType>
-  // class KokkosViewToDLPack {
-  // public:
-  //   KokkosViewToDLPack(ViewType view) : view_(view) {}
-
-  //   torch::Tensor convertToDLPack() {
-  //     // Convert the Kokkos view to DLPack
-  //     DLManagedTensor* dlpackTensor = convertToDLPack();
-
-  //     // Convert the DLPack tensor to PyTorch
-  //     torch::Tensor tensor = torch::from_dlpack(dlpackTensor);
-
-  //     // Free the DLPack tensor memory
-  //     delete[] dlpackTensor->dl_tensor.shape;
-  //     delete dlpackTensor;
-
-  //     return tensor;
-  //   }
-    
-  // private:
-  //   ViewType view_;
-
-  //   DLManagedTensor* convertToDLPack() {
-  //     // Get the Kokkos view size and dimensions
-  //     size_t numDims = ViewType::rank;
-  //     size_t* shape = new size_t[numDims];
-  //     for (size_t i = 0; i < numDims; i++) {
-  //       shape[i] = view_.extent(i);
-  //     }
-
-  //     // Create a DLPack tensor
-  //     DLManagedTensor* dlpackTensor = new DLManagedTensor;
-  //     dlpackTensor->dl_tensor.data = view_.data();
-  //     dlpackTensor->dl_tensor.ctx = const_cast<void*>(view_.impl_map().template device_data<void>());
-  //     dlpackTensor->dl_tensor.ndim = numDims;
-  //     dlpackTensor->dl_tensor.dtype = getDLPackDataType();
-  //     dlpackTensor->dl_tensor.shape = shape;
-  //     dlpackTensor->dl_tensor.strides = nullptr;
-  //     dlpackTensor->dl_tensor.byte_offset = 0;
-  //     dlpackTensor->manager_ctx = nullptr;
-  //     dlpackTensor->deleter = [](DLManagedTensor* tensor) { delete[] tensor->dl_tensor.shape; };
-
-  //     return dlpackTensor;
-  //   }
-
-  //   DLDataType getDLPackDataType() {
-  //     DLDataType dtype;
-  //     dtype.code = getDLPackTypeCode();
-  //     dtype.bits = sizeof(typename ViewType::value_type) * 8;
-  //     dtype.lanes = 1;
-  //     return dtype;
-  //   }
-
-  //   DLDataTypeCode getDLPackTypeCode() {
-  //     using ValueType = typename ViewType::value_type;
-  //     if (std::is_same<ValueType, float>::value) {
-  //       return kDLFloat;
-  //     } else if (std::is_same<ValueType, double>::value) {
-  //       return kDLFloat;
-  //     } else if (std::is_same<ValueType, int>::value) {
-  //       return kDLInt;
-  //     } else if (std::is_same<ValueType, unsigned int>::value) {
-  //       return kDLUInt;
-  //     } else if (std::is_same<ValueType, bool>::value) {
-  //       return kDLBool;
-  //     } else {
-  //       // Unsupported data type
-  //       throw std::runtime_error("Unsupported data type for DLPack conversion");
-  //     }
-  //   }
-  // };
+// Destructor for "dltensor" PyCapsules: frees the DLManagedTensor of a capsule that was never used.
+void dlpack_destructor(PyObject* capsule) {
+  if (!PyCapsule_IsValid(capsule, "dltensor")) {
+    return;  // name is no longer "dltensor" (e.g. "used_dltensor"); presumably a consumer owns the tensor
+  }
 
+  // If the capsule has not been used, we need to delete it
+  auto* dlpackTensor = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
+  if (dlpackTensor->deleter != nullptr) {  // guard: a tensor without a deleter has nothing to run
+    dlpackTensor->deleter(dlpackTensor);
+  }
+  // NOTE(review): assumes the deleter does not free the DLManagedTensor struct itself;
+  // if it does, the delete below is a double free -- confirm against the producer.
+  delete dlpackTensor;
+}
 
 struct DLPackAPI {
 
@@ -680,7 +623,7 @@ struct DLPackAPI {
     };
 
     // Create a PyCapsule with the DLPack tensor
-    PyObject* capsule = PyCapsule_New(dlpackTensor, "dltensor", nullptr);
+    PyObject* capsule = PyCapsule_New(dlpackTensor, "dltensor", dlpack_destructor);
 
     return capsule;
   }

From e1085661de1af65d07652fed0eb64373d9dadc99 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Thu, 15 Feb 2024 10:21:49 -0800
Subject: [PATCH 04/17] Fix device id

---
 kokkostbx/kokkos_dlpack.h          | 91 +++---------------------------
 simtbx/diffBragg/src/diffBragg.cpp | 28 ++++++++-
 2 files changed, 35 insertions(+), 84 deletions(-)

diff --git a/kokkostbx/kokkos_dlpack.h b/kokkostbx/kokkos_dlpack.h
index 1f47a31220..eb00e79fda 100644
--- a/kokkostbx/kokkos_dlpack.h
+++ b/kokkostbx/kokkos_dlpack.h
@@ -35,31 +35,31 @@ DLDataType getDLPackDataType() {
 
 template<typename SpaceType>
 DLDevice getDLPackDevice() {
-  DLDevice dl_device;
+  const int device_id = std::max(0, Kokkos::device_id()); // convert host id from -1 to 0
+  static_cast<void>(device_id); // only device memory reports it; DLPack fixes CPU, pinned and managed memory at id 0
   if (std::is_same<SpaceType, Kokkos::HostSpace>::value) {
-      dl_device = {kDLCPU, 0};
+      return {kDLCPU, 0};
   }
 #ifdef KOKKOS_ENABLE_CUDA
   else if (std::is_same<SpaceType, Kokkos::CudaSpace>::value) {
-      dl_device = {kDLCUDA, 0};
+      return {kDLCUDA, device_id};
   } else if (std::is_same<SpaceType, Kokkos::CudaUVMSpace>::value) {
-      dl_device = {kDLCUDAManaged, 0};
+      return {kDLCUDAManaged, 0};
   } else if (std::is_same<SpaceType, Kokkos::CudaHostPinnedSpace>::value) {
-      dl_device = {kDLCUDAHost, 0};
+      return {kDLCUDAHost, 0};
   }
 #endif
 #ifdef KOKKOS_ENABLE_HIP
   else if (std::is_same<SpaceType, Kokkos::HIPSpace>::value) {
-      dl_device = {kDLROCM, 0};
+      return {kDLROCM, device_id};
   } else if (std::is_same<SpaceType, Kokkos::HIPHostPinnedSpace>::value) {
-      dl_device = {kDLROCMHost, 0};
+      return {kDLROCMHost, 0};
   } 
 #endif
   else {
       // Extend to other device types as needed
       throw std::runtime_error("Unsupported Kokkos device type for DLPack conversion.");
   }
-  return dl_device;
 }
 
 template<typename DataType, typename SpaceType>
@@ -74,8 +74,6 @@ DLManagedTensor* view_to_dlpack(Kokkos::View<DataType, SpaceType>& view) {
   // Create a DLPack tensor
   DLManagedTensor* dlpackTensor = new DLManagedTensor;
   dlpackTensor->dl_tensor.data = view.data();
-  // dlpackTensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU;
-  // dlpackTensor->dl_tensor.device.device_id = 0;
   dlpackTensor->dl_tensor.device = getDLPackDevice<SpaceType>();
   dlpackTensor->dl_tensor.ndim = numDims;    
   dlpackTensor->dl_tensor.dtype = getDLPackDataType<DataType, SpaceType>();
@@ -84,84 +82,11 @@ DLManagedTensor* view_to_dlpack(Kokkos::View<DataType, SpaceType>& view) {
   dlpackTensor->dl_tensor.byte_offset = 0;
   dlpackTensor->manager_ctx = nullptr;
   dlpackTensor->deleter = [](DLManagedTensor* tensor) {
-      std::cout << "Blob" << std::endl;
       delete[] tensor->dl_tensor.shape;
   };
   return dlpackTensor;
 }
 
-// template <typename ViewType>
-// class KokkosViewToDLPack {
-// public:
-//   KokkosViewToDLPack(ViewType view) : view_(view) {}
-
-//   torch::Tensor convertToDLPack() {
-//     // Convert the Kokkos view to DLPack
-//     DLManagedTensor* dlpackTensor = convertToDLPack();
-
-//     // Convert the DLPack tensor to PyTorch
-//     torch::Tensor tensor = torch::from_dlpack(dlpackTensor);
-
-//     // Free the DLPack tensor memory
-//     delete[] dlpackTensor->dl_tensor.shape;
-//     delete dlpackTensor;
-
-//     return tensor;
-//   }
-  
-// private:
-//   ViewType view_;
-
-//   DLManagedTensor* convertToDLPack() {
-//     // Get the Kokkos view size and dimensions
-//     size_t numDims = ViewType::rank;
-//     size_t* shape = new size_t[numDims];
-//     for (size_t i = 0; i < numDims; i++) {
-//       shape[i] = view_.extent(i);
-//     }
-
-//     // Create a DLPack tensor
-//     DLManagedTensor* dlpackTensor = new DLManagedTensor;
-//     dlpackTensor->dl_tensor.data = view_.data();
-//     dlpackTensor->dl_tensor.ctx = const_cast<void*>(view_.impl_map().template device_data<void>());
-//     dlpackTensor->dl_tensor.ndim = numDims;
-//     dlpackTensor->dl_tensor.dtype = getDLPackDataType();
-//     dlpackTensor->dl_tensor.shape = shape;
-//     dlpackTensor->dl_tensor.strides = nullptr;
-//     dlpackTensor->dl_tensor.byte_offset = 0;
-//     dlpackTensor->manager_ctx = nullptr;
-//     dlpackTensor->deleter = [](DLManagedTensor* tensor) { delete[] tensor->dl_tensor.shape; };
-
-//     return dlpackTensor;
-//   }
-
-//   DLDataType getDLPackDataType() {
-//     DLDataType dtype;
-//     dtype.code = getDLPackTypeCode();
-//     dtype.bits = sizeof(typename ViewType::value_type) * 8;
-//     dtype.lanes = 1;
-//     return dtype;
-//   }
-
-//   DLDataTypeCode getDLPackTypeCode() {
-//     using ValueType = typename ViewType::value_type;
-//     if (std::is_same<ValueType, float>::value) {
-//       return kDLFloat;
-//     } else if (std::is_same<ValueType, double>::value) {
-//       return kDLFloat;
-//     } else if (std::is_same<ValueType, int>::value) {
-//       return kDLInt;
-//     } else if (std::is_same<ValueType, unsigned int>::value) {
-//       return kDLUInt;
-//     } else if (std::is_same<ValueType, bool>::value) {
-//       return kDLBool;
-//     } else {
-//       // Unsupported data type
-//       throw std::runtime_error("Unsupported data type for DLPack conversion");
-//     }
-//   }
-// };
-
 }
 
 #endif  // KOKKOS_DLPACK_H
diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index 51988d2bdc..3913481057 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1508,12 +1508,38 @@ boost::python::tuple diffBragg::get_ncells_derivative_pixels(){
 }
 
 #ifdef DIFFBRAGG_HAVE_KOKKOS
+void dlpack_destructor(PyObject* capsule) {
+  if (!PyCapsule_IsValid(capsule, "dltensor")) {
+    return;
+  }
+
+  // If the capsule has not been used, we need to delete it
+  DLManagedTensor* dlpackTensor = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
+  dlpackTensor->deleter(dlpackTensor);
+  delete dlpackTensor;
+}
+
+// template <typename function>
+// struct DLPackAPI {
+//   PyObject* dlpack() {
+//     if (diffBragg::diffBragg_runner == nullptr) {
+//         return nullptr;
+//     }
+//     return PyCapsule_New(function(), "dltensor", dlpack_destructor); 
+//   }
+
+//   boost::python::tuple dlpack_device() {
+//     auto device = kokkostbx::getDLPackDevice<MemSpace>();
+//     return boost::python::make_tuple(static_cast<int32_t>(device.device_type), device.device_id);
+//   }
+// };
+
 PyObject* diffBragg::get_d_Ncells_images() {
 
     if (diffBragg_runner == nullptr) {
         return nullptr;
     }
-    return PyCapsule_New(diffBragg_runner->get_d_Ncells_images(), "dltensor", nullptr);   
+    return PyCapsule_New(diffBragg_runner->get_d_Ncells_images(), "dltensor", dlpack_destructor);   
 }
 #endif
 

From 8a148357781a8e63cec6fa8ccc3919943be3e555 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Thu, 15 Feb 2024 16:37:46 -0800
Subject: [PATCH 05/17] Add DLPack API for diffBragg results

---
 simtbx/diffBragg/src/diffBragg.cpp       | 123 ++++++++++++++++++-----
 simtbx/diffBragg/src/diffBragg.h         |  22 ++++
 simtbx/diffBragg/src/diffBraggKOKKOS.cpp |  92 ++++++++++++++++-
 simtbx/diffBragg/src/diffBraggKOKKOS.h   |  21 ++++
 simtbx/diffBragg/src/diffBragg_ext.cpp   |  47 ++++++++-
 5 files changed, 276 insertions(+), 29 deletions(-)

diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index 3913481057..4aa58e692f 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -7,6 +7,7 @@
 #include<unordered_map>
 #include <cctbx/sgtbx/reciprocal_space_asu.h>
 #include <boost/python/numpy.hpp>
+#include "diffBragg.h"
 
 namespace np=boost::python::numpy;
 
@@ -1509,37 +1510,111 @@ boost::python::tuple diffBragg::get_ncells_derivative_pixels(){
 
 #ifdef DIFFBRAGG_HAVE_KOKKOS
 void dlpack_destructor(PyObject* capsule) {
-  if (!PyCapsule_IsValid(capsule, "dltensor")) {
-    return;
-  }
+    if (!PyCapsule_IsValid(capsule, "dltensor")) {
+        return;  // capsule is not (or no longer) a valid "dltensor" capsule: release nothing
+    }
 
-  // If the capsule has not been used, we need to delete it
-  DLManagedTensor* dlpackTensor = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
-  dlpackTensor->deleter(dlpackTensor);
-  delete dlpackTensor;
+    // If the capsule has not been used, we need to delete it: run the tensor's deleter, then free the struct
+    DLManagedTensor* dlpackTensor = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
+    dlpackTensor->deleter(dlpackTensor);  // NOTE(review): a consumer that runs only this deleter may leave the struct allocated -- confirm
+    delete dlpackTensor;
 }
 
-// template <typename function>
-// struct DLPackAPI {
-//   PyObject* dlpack() {
-//     if (diffBragg::diffBragg_runner == nullptr) {
-//         return nullptr;
-//     }
-//     return PyCapsule_New(function(), "dltensor", dlpack_destructor); 
-//   }
-
-//   boost::python::tuple dlpack_device() {
-//     auto device = kokkostbx::getDLPackDevice<MemSpace>();
-//     return boost::python::make_tuple(static_cast<int32_t>(device.device_type), device.device_id);
-//   }
-// };
-
-PyObject* diffBragg::get_d_Ncells_images() {
 
+// Wraps the DLManagedTensor returned by the given diffBraggKOKKOS getter in a "dltensor" PyCapsule; nullptr if no runner exists
+PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func)()) {
     if (diffBragg_runner == nullptr) {
         return nullptr;
     }
-    return PyCapsule_New(diffBragg_runner->get_d_Ncells_images(), "dltensor", dlpack_destructor);   
+    return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);   // call func on the runner; dlpack_destructor is the capsule destructor
+}
+
+PyObject* diffBragg::get_floatimage() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_floatimage);
+}
+
+PyObject* diffBragg::get_wavelenimage() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_wavelenimage);
+}
+
+PyObject* diffBragg::get_d_diffuse_gamma_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_gamma_images);
+}
+
+PyObject* diffBragg::get_d_diffuse_sigma_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_sigma_images);
+}
+
+PyObject* diffBragg::get_d_Umat_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Umat_images);
+}
+
+PyObject* diffBragg::get_d2_Umat_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Umat_images);
+}
+
+PyObject* diffBragg::get_d_Bmat_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images);
+}
+
+PyObject* diffBragg::get_d2_Bmat_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Bmat_images);
+}
+
+PyObject* diffBragg::get_d_Ncells_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Ncells_images);
+}
+
+PyObject* diffBragg::get_d2_Ncells_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Ncells_images);
+}
+
+PyObject* diffBragg::get_d_fcell_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fcell_images);
+}
+
+PyObject* diffBragg::get_d2_fcell_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_fcell_images);
+}
+
+PyObject* diffBragg::get_d_eta_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_eta_images);
+}
+
+PyObject* diffBragg::get_d2_eta_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_eta_images);
+}
+
+PyObject* diffBragg::get_d_lambda_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_lambda_images);
+}
+
+PyObject* diffBragg::get_d2_lambda_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_lambda_images);
+}
+
+PyObject* diffBragg::get_d_panel_rot_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_rot_images);
+}
+
+PyObject* diffBragg::get_d2_panel_rot_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_rot_images);
+}
+
+PyObject* diffBragg::get_d_panel_orig_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_orig_images);
+}
+
+PyObject* diffBragg::get_d2_panel_orig_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_orig_images);
+}
+
+PyObject* diffBragg::get_d_fp_fdp_images() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images);
+}
+
+PyObject* diffBragg::get_Fhkl_scale_deriv() {
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv);
 }
 #endif
 
diff --git a/simtbx/diffBragg/src/diffBragg.h b/simtbx/diffBragg/src/diffBragg.h
index c27567ca03..599e55a3d0 100644
--- a/simtbx/diffBragg/src/diffBragg.h
+++ b/simtbx/diffBragg/src/diffBragg.h
@@ -238,7 +238,29 @@ class diffBragg: public nanoBragg{
   boost::python::tuple get_fp_fdp_derivative_pixels();
   boost::python::tuple get_ncells_derivative_pixels();
 #ifdef DIFFBRAGG_HAVE_KOKKOS
+  PyObject* PyCapsule_Wrapper(DLManagedTensor* (diffBraggKOKKOS::*func)());
+  PyObject* get_floatimage();
+  PyObject* get_wavelenimage();
+  PyObject* get_d_diffuse_gamma_images();
+  PyObject* get_d_diffuse_sigma_images();
+  PyObject* get_d_Umat_images();
+  PyObject* get_d2_Umat_images();
+  PyObject* get_d_Bmat_images();
+  PyObject* get_d2_Bmat_images();
   PyObject* get_d_Ncells_images();
+  PyObject* get_d2_Ncells_images();
+  PyObject* get_d_fcell_images();
+  PyObject* get_d2_fcell_images();
+  PyObject* get_d_eta_images();
+  PyObject* get_d2_eta_images();
+  PyObject* get_d_lambda_images();
+  PyObject* get_d2_lambda_images();
+  PyObject* get_d_panel_rot_images();
+  PyObject* get_d2_panel_rot_images();
+  PyObject* get_d_panel_orig_images();
+  PyObject* get_d2_panel_orig_images();
+  PyObject* get_d_fp_fdp_images();
+  PyObject* get_Fhkl_scale_deriv();
 #endif
   boost::python::tuple get_diffuse_gamma_derivative_pixels();
   boost::python::tuple get_diffuse_sigma_derivative_pixels();
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
index 3420469ed8..016f684d05 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
@@ -43,10 +43,6 @@ uint32_t combine_refinement_flags(flags& db_flags) {
     return refine_flag;
 }
 
-DLManagedTensor* diffBraggKOKKOS::get_d_Ncells_images() {
-    return kokkostbx::view_to_dlpack(m_d_Ncells_images);
-}
-
 void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
     int Npix_to_model,
     std::vector<unsigned int>& panels_fasts_slows,
@@ -643,3 +639,91 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
 
     Kokkos::Tools::popRegion();
 }
+
+DLManagedTensor* diffBraggKOKKOS::get_floatimage() {
+    return kokkostbx::view_to_dlpack(m_floatimage);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_wavelenimage() {
+    return kokkostbx::view_to_dlpack(m_wavelenimage);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_diffuse_gamma_images() {
+    return kokkostbx::view_to_dlpack(m_d_diffuse_gamma_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_diffuse_sigma_images() {
+    return kokkostbx::view_to_dlpack(m_d_diffuse_sigma_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_Umat_images() {
+    return kokkostbx::view_to_dlpack(m_d_Umat_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_Umat_images() {
+    return kokkostbx::view_to_dlpack(m_d2_Umat_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_Bmat_images() {
+    return kokkostbx::view_to_dlpack(m_d_Bmat_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_Bmat_images() {
+    return kokkostbx::view_to_dlpack(m_d2_Bmat_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_Ncells_images() {
+    return kokkostbx::view_to_dlpack(m_d_Ncells_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_Ncells_images() {
+    return kokkostbx::view_to_dlpack(m_d2_Ncells_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_fcell_images() {
+    return kokkostbx::view_to_dlpack(m_d_fcell_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_fcell_images() {
+    return kokkostbx::view_to_dlpack(m_d2_fcell_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_eta_images() {
+    return kokkostbx::view_to_dlpack(m_d_eta_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_eta_images() {
+    return kokkostbx::view_to_dlpack(m_d2_eta_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_lambda_images() {
+    return kokkostbx::view_to_dlpack(m_d_lambda_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_lambda_images() {
+    return kokkostbx::view_to_dlpack(m_d2_lambda_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_panel_rot_images() {
+    return kokkostbx::view_to_dlpack(m_d_panel_rot_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_panel_rot_images() {
+    return kokkostbx::view_to_dlpack(m_d2_panel_rot_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_panel_orig_images() {
+    return kokkostbx::view_to_dlpack(m_d_panel_orig_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d2_panel_orig_images() {
+    return kokkostbx::view_to_dlpack(m_d2_panel_orig_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_d_fp_fdp_images() {
+    return kokkostbx::view_to_dlpack(m_d_fp_fdp_images);
+}
+
+DLManagedTensor* diffBraggKOKKOS::get_Fhkl_scale_deriv() {
+    return kokkostbx::view_to_dlpack(m_Fhkl_scale_deriv);
+}
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.h b/simtbx/diffBragg/src/diffBraggKOKKOS.h
index a88ef5d4d3..78ed67a505 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.h
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.h
@@ -149,7 +149,28 @@ class diffBraggKOKKOS {
         // diffBragg_kokkosPointers& kp,
         timer_variables& TIMERS);
 
+    DLManagedTensor* get_floatimage();
+    DLManagedTensor* get_wavelenimage();
+    DLManagedTensor* get_d_diffuse_gamma_images();
+    DLManagedTensor* get_d_diffuse_sigma_images();
+    DLManagedTensor* get_d_Umat_images();
+    DLManagedTensor* get_d2_Umat_images();
+    DLManagedTensor* get_d_Bmat_images();
+    DLManagedTensor* get_d2_Bmat_images();
     DLManagedTensor* get_d_Ncells_images();
+    DLManagedTensor* get_d2_Ncells_images();
+    DLManagedTensor* get_d_fcell_images();
+    DLManagedTensor* get_d2_fcell_images();
+    DLManagedTensor* get_d_eta_images();
+    DLManagedTensor* get_d2_eta_images();
+    DLManagedTensor* get_d_lambda_images();
+    DLManagedTensor* get_d2_lambda_images();
+    DLManagedTensor* get_d_panel_rot_images();
+    DLManagedTensor* get_d2_panel_rot_images();
+    DLManagedTensor* get_d_panel_orig_images();
+    DLManagedTensor* get_d2_panel_orig_images();
+    DLManagedTensor* get_d_fp_fdp_images();
+    DLManagedTensor* get_Fhkl_scale_deriv();
 };
 
 #endif
diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index 1245730aa7..84aad9b7de 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -733,7 +733,6 @@ struct DLPackAPI {
       .def("set_ncells_values", &simtbx::nanoBragg::diffBragg::set_ncells_values, "set Ncells values as a 3-tuple (Na, Nb, Nc)")
 
       .def("get_ncells_values", &simtbx::nanoBragg::diffBragg::get_ncells_values, "get Ncells values as a 3-tuple (Na, Nb, Nc)")
-      .def("get_d_Ncells_images", &simtbx::nanoBragg::diffBragg::get_d_Ncells_images, "get DLPackTensor for d_Ncells_images; pot. on GPU")
 
       .def("add_diffBragg_spots_full", &simtbx::nanoBragg::diffBragg::add_diffBragg_spots_full, "forward model and gradients at every pixel")
 
@@ -1093,6 +1092,52 @@ struct DLPackAPI {
                     make_function(&set_beams,dcp()),
                     "list of dxtbx::Beam objects corresponding to each zero-divergence and monochromatic x-ray point source in the numerical simulation ")
 
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+      .def("get_floatimage", &simtbx::nanoBragg::diffBragg::get_floatimage, "get DLPackTensor for floatimage; pot. on GPU")
+
+      .def("get_wavelenimage", &simtbx::nanoBragg::diffBragg::get_wavelenimage, "get DLPackTensor for wavelenimage; pot. on GPU")
+
+      .def("get_d_diffuse_gamma_images", &simtbx::nanoBragg::diffBragg::get_d_diffuse_gamma_images, "get DLPackTensor for d_diffuse_gamma_images; pot. on GPU")
+
+      .def("get_d_diffuse_sigma_images", &simtbx::nanoBragg::diffBragg::get_d_diffuse_sigma_images, "get DLPackTensor for d_diffuse_sigma_images; pot. on GPU")
+
+      .def("get_d_Umat_images", &simtbx::nanoBragg::diffBragg::get_d_Umat_images, "get DLPackTensor for d_Umat_images; pot. on GPU")
+
+      .def("get_d2_Umat_images", &simtbx::nanoBragg::diffBragg::get_d2_Umat_images, "get DLPackTensor for d2_Umat_images; pot. on GPU")
+
+      .def("get_d_Bmat_images", &simtbx::nanoBragg::diffBragg::get_d_Bmat_images, "get DLPackTensor for d_Bmat_images; pot. on GPU")
+
+      .def("get_d2_Bmat_images", &simtbx::nanoBragg::diffBragg::get_d2_Bmat_images, "get DLPackTensor for d2_Bmat_images; pot. on GPU")
+
+      .def("get_d_Ncells_images", &simtbx::nanoBragg::diffBragg::get_d_Ncells_images, "get DLPackTensor for d_Ncells_images; pot. on GPU")
+
+      .def("get_d2_Ncells_images", &simtbx::nanoBragg::diffBragg::get_d2_Ncells_images, "get DLPackTensor for d2_Ncells_images; pot. on GPU")
+
+      .def("get_d_fcell_images", &simtbx::nanoBragg::diffBragg::get_d_fcell_images, "get DLPackTensor for d_fcell_images; pot. on GPU")
+
+      .def("get_d2_fcell_images", &simtbx::nanoBragg::diffBragg::get_d2_fcell_images, "get DLPackTensor for d2_fcell_images; pot. on GPU")
+
+      .def("get_d_eta_images", &simtbx::nanoBragg::diffBragg::get_d_eta_images, "get DLPackTensor for d_eta_images; pot. on GPU")
+
+      .def("get_d2_eta_images", &simtbx::nanoBragg::diffBragg::get_d2_eta_images, "get DLPackTensor for d2_eta_images; pot. on GPU")
+
+      .def("get_d_lambda_images", &simtbx::nanoBragg::diffBragg::get_d_lambda_images, "get DLPackTensor for d_lambda_images; pot. on GPU")
+
+      .def("get_d2_lambda_images", &simtbx::nanoBragg::diffBragg::get_d2_lambda_images, "get DLPackTensor for d2_lambda_images; pot. on GPU")
+
+      .def("get_d_panel_rot_images", &simtbx::nanoBragg::diffBragg::get_d_panel_rot_images, "get DLPackTensor for d_panel_rot_images; pot. on GPU")
+
+      .def("get_d2_panel_rot_images", &simtbx::nanoBragg::diffBragg::get_d2_panel_rot_images, "get DLPackTensor for d2_panel_rot_images; pot. on GPU")
+
+      .def("get_d_panel_orig_images", &simtbx::nanoBragg::diffBragg::get_d_panel_orig_images, "get DLPackTensor for d_panel_orig_images; pot. on GPU")
+
+      .def("get_d2_panel_orig_images", &simtbx::nanoBragg::diffBragg::get_d2_panel_orig_images, "get DLPackTensor for d2_panel_orig_images; pot. on GPU")
+
+      .def("get_d_fp_fdp_images", &simtbx::nanoBragg::diffBragg::get_d_fp_fdp_images, "get DLPackTensor for d_fp_fdp_images; pot. on GPU")
+
+      .def("get_Fhkl_scale_deriv", &simtbx::nanoBragg::diffBragg::get_Fhkl_scale_deriv, "get DLPackTensor for Fhkl_scale_deriv; pot. on GPU")
+#endif
+
     ; // end of diffBragg extention
 
   } // end of diffBragg_init_module

From db99d1d10fcbe1b2381c23a853f581540276f674 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Thu, 15 Feb 2024 16:39:59 -0800
Subject: [PATCH 06/17] strip trailing whitespace in DLPack code

---
 kokkostbx/kokkos_dlpack.h              | 4 ++--
 simtbx/diffBragg/src/diffBragg.cpp     | 2 +-
 simtbx/diffBragg/src/diffBragg_ext.cpp | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kokkostbx/kokkos_dlpack.h b/kokkostbx/kokkos_dlpack.h
index eb00e79fda..dedddf6264 100644
--- a/kokkostbx/kokkos_dlpack.h
+++ b/kokkostbx/kokkos_dlpack.h
@@ -54,7 +54,7 @@ DLDevice getDLPackDevice() {
       return {kDLROCM, device_id};
   } else if (std::is_same<SpaceType, Kokkos::HIPHostPinnedSpace>::value) {
       return {kDLROCMHost, device_id};
-  } 
+  }
 #endif
   else {
       // Extend to other device types as needed
@@ -75,7 +75,7 @@ DLManagedTensor* view_to_dlpack(Kokkos::View<DataType, SpaceType>& view) {
   DLManagedTensor* dlpackTensor = new DLManagedTensor;
   dlpackTensor->dl_tensor.data = view.data();
   dlpackTensor->dl_tensor.device = getDLPackDevice<SpaceType>();
-  dlpackTensor->dl_tensor.ndim = numDims;    
+  dlpackTensor->dl_tensor.ndim = numDims;
   dlpackTensor->dl_tensor.dtype = getDLPackDataType<DataType, SpaceType>();
   dlpackTensor->dl_tensor.shape = shape;
   dlpackTensor->dl_tensor.strides = nullptr;
diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index 4aa58e692f..4ede77192d 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1526,7 +1526,7 @@ PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func
     if (diffBragg_runner == nullptr) {
         return nullptr;
     }
-    return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);   
+    return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);
 }
 
 PyObject* diffBragg::get_floatimage() {
diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index 84aad9b7de..1cac964785 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -611,7 +611,7 @@ struct DLPackAPI {
     dlpackTensor->dl_tensor.data = static_cast<void*>(&container);
     dlpackTensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU;
     dlpackTensor->dl_tensor.device.device_id = 0;
-    dlpackTensor->dl_tensor.ndim = numDims;    
+    dlpackTensor->dl_tensor.ndim = numDims;
     dlpackTensor->dl_tensor.dtype = getDLPackDataType();
     dlpackTensor->dl_tensor.shape = shape;
     dlpackTensor->dl_tensor.strides = nullptr;
@@ -634,7 +634,7 @@ struct DLPackAPI {
     dtype.bits = sizeof(double) * 8;
     dtype.lanes = 1;
     return dtype;
-  }  
+  }
 
   void print_hello() {
     std::cout << "Hello Python!" << std::endl;

From 62c95849865768ca28b8516c2520505919d6443b Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Thu, 15 Feb 2024 17:45:32 -0800
Subject: [PATCH 07/17] strip trailing whitespace in citations.params

---
 libtbx/citations.params | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libtbx/citations.params b/libtbx/citations.params
index 3b10bf44aa..0a4a3b2d0a 100644
--- a/libtbx/citations.params
+++ b/libtbx/citations.params
@@ -484,12 +484,12 @@ citation {
   article_id = mmseqs2b
   authors = Mirdita M, Steinegger M, Söding J
   title = MMseqs2 desktop and local web server app for fast interactive sequence searches
-  journal = Bioinformatics 
+  journal = Bioinformatics
   volume = 35
-  pages = 2856-2858 
+  pages = 2856-2858
   year = 2019
   doi_id =  10.1093/bioinformatics/bty1057
-  pmid = 30615063 
+  pmid = 30615063
 }
 citation {
   article_id = mmseqs2

From c4c702023bd248ae5d868c81b9c22d97ea1dd52e Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Tue, 20 Feb 2024 11:16:20 -0800
Subject: [PATCH 08/17] add kokkos_device function

---
 simtbx/diffBragg/src/diffBragg_ext.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index 1cac964785..dd1fe393b8 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -504,6 +504,22 @@ namespace boost_python { namespace {
         return boost::python::make_tuple(diffBragg.pythony_indices,diffBragg.pythony_amplitudes);
   }
 
+std::string kokkos_device() {  // returns a "<type>:<index>" device string; exposed to Python as "kokkos_device"
+  std::string backend = "cpu:0";  // value returned when no CUDA/HIP branch below overrides it
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+  if (Kokkos::is_finalized()) {  // runtime already shut down: report that specifically
+    throw std::runtime_error("Error: Kokkos has been finalized.\n");
+  }
+  if (!Kokkos::is_initialized()) {  // runtime never started: refuse to report a device
+    throw std::runtime_error("Error: Kokkos not initialized.\n");
+  }
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  backend = "cuda:" + std::to_string( Kokkos::device_id() );  // NOTE(review): HIP also yields "cuda:", presumably to match PyTorch's device naming on ROCm - confirm
+#endif  // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
+#endif  // DIFFBRAGG_HAVE_KOKKOS
+  return backend;
+}
+
 #ifdef DIFFBRAGG_HAVE_KOKKOS
   void finalize_kokkos(){
     Kokkos::finalize();
@@ -672,6 +688,8 @@ struct DLPackAPI {
     def("initialize_kokkos", initialize_kokkos,
         "the sole argument `dev` (an int from 0 to Ngpu-1) is passed to Kokkos::initialize()");
 
+    def("kokkos_device", kokkos_device, "returns kokkos device for use in PyTorch");
+
     def("print_dlpack",PrintDLTensorParameters,"Print information about a dlpack");
 
     // def("get_d_Ncells_images", &get_dlpack, "Return DLPackTensor for d_Ncells_images; pot. on GPU")

From b8de3fb3104a1246b6776e4a44cacec067f3e071 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Tue, 5 Mar 2024 10:32:33 -0800
Subject: [PATCH 09/17] update diffBragg:model() to use pytorch

---
 simtbx/diffBragg/hopper_utils.py | 42 +++++++++++++++-----------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index 5acd3ced80..43f844e17b 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -2,6 +2,7 @@
 import time
 import os
 import json
+import torch
 from dials.algorithms.shoebox import MaskCode
 from copy import deepcopy
 from dials.model.data import Shoebox
@@ -24,7 +25,7 @@
 from simtbx.diffBragg import utils
 from simtbx.diffBragg.refiners.parameters import RangedParameter, Parameters, PositiveParameter
 from simtbx.diffBragg.attr_list import NB_BEAM_ATTRS, NB_CRYST_ATTRS, DIFFBRAGG_ATTRS
-from simtbx.diffBragg import psf
+from simtbx.diffBragg import psf, kokkos_device
 
 try:
     from line_profiler import LineProfiler
@@ -1570,7 +1571,7 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
     J = None
     if compute_grad:
         # This should be all params save the Fhkl params
-        J = np.zeros((nparam-SIM.Num_ASU*SIM.num_Fhkl_channels, npix))  # gradients
+        J = torch.zeros((nparam-SIM.Num_ASU*SIM.num_Fhkl_channels, npix), device=kokkos_device())  # gradients
 
     model_pix = None
     #TODO check roiScales mode and if its broken, git rid of it!
@@ -1581,7 +1582,7 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
     if not Mod.params.fix.perRoiScale:
         perRoiParams = [Mod.P["scale_roi%d" % roi_id] for roi_id in Mod.roi_id_unique]
         perRoiScaleFactors = [p.get_val(x[p.xpos]) for p in perRoiParams]
-        roiScalesPerPix = np.zeros(npix)
+        roiScalesPerPix = torch.zeros(npix, device=kokkos_device())
         for i_roi, roi_id in enumerate(Mod.roi_id_unique):
             slc = Mod.roi_id_slices[roi_id][0]
             roiScalesPerPix[slc] = perRoiScaleFactors[i_roi]
@@ -1615,8 +1616,7 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
 
         SIM.D.add_diffBragg_spots(pfs)
 
-        pix_noRoiScale = SIM.D.raw_pixels_roi[:npix]
-        pix_noRoiScale = pix_noRoiScale.as_numpy_array()
+        pix_noRoiScale = torch.from_dlpack(SIM.D.get_floatimage())[:npix]
 
         pix = pix_noRoiScale * roiScalesPerPix
 
@@ -1635,15 +1635,15 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
 
             if RotXYZ_params[0].refine:
                 for i_rot in range(3):
-                    rot_grad = scale * SIM.D.get_derivative_pixels(ROTXYZ_IDS[i_rot]).as_numpy_array()[:npix]
+                    rot_grad = scale * torch.from_dlpack(SIM.D.get_d_Umat_images())[:npix]
                     rot_p = RotXYZ_params[i_rot]
                     rot_grad = rot_p.get_deriv(x[rot_p.xpos], rot_grad)
                     J[rot_p.xpos] += rot_grad
 
             if Nabc_params[0].refine:
-                Nabc_grads = SIM.D.get_ncells_derivative_pixels()
+                Nabc_grads = scale * torch.from_dlpack(SIM.D.get_d_Ncells_images())[:3*npix]
                 for i_n in range(3):
-                    N_grad = scale*(Nabc_grads[i_n][:npix].as_numpy_array())
+                    N_grad = Nabc_grads[i_n*npix:(i_n+1)*npix]
                     p = Nabc_params[i_n]
                     N_grad = p.get_deriv(x[p.xpos], N_grad)
                     J[p.xpos] += N_grad
@@ -1651,53 +1651,51 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
                         break
 
             if Ndef_params[0].refine:
-                Ndef_grads = SIM.D.get_ncells_def_derivative_pixels()
+                Ndef_grads = scale * torch.from_dlpack(SIM.D.get_d_Ncells_images())[3*npix:]
                 for i_n in range(3):
-                    N_grad = scale * (Ndef_grads[i_n][:npix].as_numpy_array())
+                    N_grad = Ndef_grads[i_n*npix:(i_n+1)*npix]
                     p = Ndef_params[i_n]
                     N_grad = p.get_deriv(x[p.xpos], N_grad)
                     J[p.xpos] += N_grad
 
             if SIM.D.use_diffuse:
                 for t in ['gamma','sigma']:
-                    diffuse_grads = getattr(SIM.D, "get_diffuse_%s_derivative_pixels" % t)()
+                    diffuse_grads = scale * torch.from_dlpack( getattr(SIM.D, "get_d_diffuse_%s_images" % t)() )
                     if diffuse_params_lookup[t][0].refine:
                         for i_diff in range(3):
-                            diff_grad = scale*(diffuse_grads[i_diff][:npix].as_numpy_array())
+                            diff_grad = diffuse_grads[i_diff*npix:(i_diff+1)*npix]
                             p = diffuse_params_lookup[t][i_diff]
                             diff_grad = p.get_deriv(x[p.xpos], diff_grad)
                             J[p.xpos] += diff_grad
 
             if eta_params[0].refine:
-                if SIM.D.has_anisotropic_mosaic_spread:
-                    eta_derivs = SIM.D.get_aniso_eta_deriv_pixels()
-                else:
-                    eta_derivs = [SIM.D.get_derivative_pixels(ETA_ID)]
+                eta_derivs = scale * torch.from_dlpack(SIM.D.get_d_eta_images())
                 num_eta = 3 if SIM.D.has_anisotropic_mosaic_spread else 1
                 for i_eta in range(num_eta):
                     p = eta_params[i_eta]
-                    eta_grad = scale * (eta_derivs[i_eta][:npix].as_numpy_array())
+                    eta_grad = eta_derivs[i_eta*npix:(i_eta+1)*npix]
                     eta_grad = p.get_deriv(x[p.xpos], eta_grad)
                     J[p.xpos] += eta_grad
 
             if ucell_params[0].refine:
+                ucell_grads = scale * torch.from_dlpack(SIM.D.get_d_Umat_images())
                 for i_ucell in range(nucell):
                     p = ucell_params[i_ucell]
-                    deriv = scale*SIM.D.get_derivative_pixels(UCELL_ID_OFFSET+i_ucell).as_numpy_array()[:npix]
+                    deriv = ucell_grads[i_ucell*npix: (i_ucell+1)*npix]
                     deriv = p.get_deriv(x[p.xpos], deriv)
                     J[p.xpos] += deriv
 
             if DetZ.refine:
-                d = SIM.D.get_derivative_pixels(DETZ_ID).as_numpy_array()[:npix]
+                d = torch.from_dlpack(SIM.D.get_d_panel_orig_images())[npix:2*npix]
                 d = DetZ.get_deriv(x[DetZ.xpos], d)
                 J[DetZ.xpos] += d
 
             if Mod.P["lambda_offset"].refine:
-                lambda_derivs = SIM.D.get_lambda_derivative_pixels()
+                lambda_derivs = torch.from_dlpack(SIM.D.get_d_lambda_images())
                 lambda_param_names = "lambda_offset", "lambda_scale"
-                for d,name in zip(lambda_derivs, lambda_param_names):
+                for i_lmbd,name in enumerate(lambda_param_names):
                     p = Mod.P[name]
-                    d = d.as_numpy_array()[:npix]
+                    d = lambda_derivs[i_lmbd*npix:(i_lmbd+1)*npix]
                     d = p.get_deriv(x[p.xpos], d)
                     J[p.xpos] += d
 

From 9187961ff5820bd93f132a5d89ffdf4c5a8a88a2 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Tue, 5 Mar 2024 14:46:10 -0800
Subject: [PATCH 10/17] Update diffBragg:DataModeler to use pytorch

---
 simtbx/diffBragg/hopper_utils.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index 43f844e17b..a3440ec0d8 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -693,16 +693,17 @@ def data_to_one_dim(self, img_data, is_trusted, background):
         self.all_q_perpix = np.array(all_q_perpix)
         pan_fast_slow = np.ascontiguousarray((np.vstack([all_pid, all_fast, all_slow]).T).ravel())
         self.pan_fast_slow = flex.size_t(pan_fast_slow)
-        self.all_background = np.array(all_background)
+        self.all_background = torch.tensor(all_background, device=kokkos_device())
         self.roi_id = np.array(roi_id)
-        self.all_data = np.array(all_data)
+        self.all_data = torch.tensor(all_data, device=kokkos_device())
         if np.allclose(all_sigma_rdout, self.nominal_sigma_rdout):
             self.all_sigma_rdout = self.nominal_sigma_rdout
         else:
             self.all_sigma_rdout = np.array(all_sigma_rdout)
-        self.all_sigmas = np.array(all_sigmas)
+        self.all_sigmas = torch.tensor(all_sigmas, device=kokkos_device())
         # note rare chance for sigmas to be nan if the args of sqrt is below 0
-        self.all_trusted = np.logical_and(np.array(all_trusted), ~np.isnan(all_sigmas))
+        all_trusted = torch.tensor(all_trusted, device=kokkos_device())
+        self.all_trusted = torch.logical_and(all_trusted, ~torch.isnan(self.all_sigmas))
 
         if self.params.roi.skip_roi_with_negative_bg:
             # Dont include pixels whose background model is below 0
@@ -1834,7 +1835,7 @@ def __call__(self, x, *args, **kwargs):
         self.old_J = J
         self.iteration += 1
         self.g = g
-        return f
+        return f.cpu()
 
 
 def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores=False):
@@ -1905,14 +1906,14 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores
     V = model_pix + sigma_rdout**2
     # TODO:what if V is allowed to be negative? The logarithm/sqrt will explore below
     resid_square = resid**2
-    fLogLike = (.5*(np.log(2*np.pi*V) + resid_square / V))
+    fLogLike = (.5*(torch.log(2*torch.pi*V) + resid_square / V))
     if params.roi.allow_overlapping_spots:
         fLogLike /= mod.all_freq
     fLogLike = fLogLike[trusted].sum()   # negative log Likelihood target
 
     # width of z-score should decrease as refinement proceeds
-    zscore_per = resid/np.sqrt(V)
-    zscore_sigma = np.std(zscore_per[trusted])
+    zscore_per = resid/torch.sqrt(V)
+    zscore_sigma = torch.std(zscore_per[trusted])
 
     restraint_terms = {}
     if params.use_restraints:

From 1ccab2873ceadf4d13fba2a53ca8f69299e77f60 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Wed, 6 Mar 2024 14:14:04 -0800
Subject: [PATCH 11/17] further update hopper_utils to use PyTorch

---
 simtbx/diffBragg/hopper_utils.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index a3440ec0d8..1c885d6102 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -1679,7 +1679,7 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
                     J[p.xpos] += eta_grad
 
             if ucell_params[0].refine:
-                ucell_grads = scale * torch.from_dlpack(SIM.D.get_d_Umat_images())
+                ucell_grads = scale * torch.from_dlpack(SIM.D.get_d_Bmat_images())
                 for i_ucell in range(nucell):
                     p = ucell_params[i_ucell]
                     deriv = ucell_grads[i_ucell*npix: (i_ucell+1)*npix]
@@ -1835,7 +1835,7 @@ def __call__(self, x, *args, **kwargs):
         self.old_J = J
         self.iteration += 1
         self.g = g
-        return f.cpu()
+        return f
 
 
 def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores=False):
@@ -1909,11 +1909,11 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores
     fLogLike = (.5*(torch.log(2*torch.pi*V) + resid_square / V))
     if params.roi.allow_overlapping_spots:
         fLogLike /= mod.all_freq
-    fLogLike = fLogLike[trusted].sum()   # negative log Likelihood target
+    fLogLike = fLogLike[trusted].sum().item()   # negative log Likelihood target
 
     # width of z-score should decrease as refinement proceeds
     zscore_per = resid/torch.sqrt(V)
-    zscore_sigma = torch.std(zscore_per[trusted])
+    zscore_sigma = torch.std(zscore_per[trusted]).item()
 
     restraint_terms = {}
     if params.use_restraints:
@@ -2188,12 +2188,12 @@ def get_new_xycalcs(Modeler, new_exp, old_refl_tag="dials"):
     for i_roi in range(len(bragg_subimg)):
 
         ref_idx = Modeler.refls_idx[i_roi]
-
         #assert ref_idx==i_roi
-        if np.any(bragg_subimg[i_roi] > 0):
+        if torch.any(bragg_subimg[i_roi] > 0):
             I = bragg_subimg[i_roi]
-            assert np.all(I>=0)
-            Y, X = np.indices(bragg_subimg[i_roi].shape)
+            assert torch.all(I>=0)
+            ny, nx = bragg_subimg[i_roi].shape
+            X, Y = torch.meshgrid(torch.arange(nx, device=kokkos_device()), torch.arange(ny, device=kokkos_device()), indexing='xy')
             x1, _, y1, _ = Modeler.rois[i_roi]
 
             com_x, com_y, _ = new_refls[ref_idx]["xyzobs.px.value"]
@@ -2206,11 +2206,13 @@ def get_new_xycalcs(Modeler, new_exp, old_refl_tag="dials"):
             except IndexError:
                 continue
 
-            X += x1
-            Y += y1
+            X = X + x1
+            Y = Y + y1
             Isum = I.sum()
             xcom = (X * I).sum() / Isum + .5
+            xcom = xcom.item()
             ycom = (Y * I).sum() / Isum + .5
+            ycom = ycom.item()
             com = xcom, ycom, 0
 
             pid = Modeler.pids[i_roi]

From d3907bd913e524b3905efaccbb11b31a3ccf4f2f Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Wed, 6 Mar 2024 17:29:08 -0800
Subject: [PATCH 12/17] Fix for pytorch in hopper

---
 simtbx/diffBragg/hopper_utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index 1c885d6102..50e0acaadb 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -684,10 +684,10 @@ def data_to_one_dim(self, img_data, is_trusted, background):
             x1, x2, y1, y2 = self.rois[i_roi]
             freq = pixel_counter[pid, y1:y2, x1:x2].ravel()
             all_freq += list(freq)
-        self.all_freq = np.array(all_freq, np.int32)  # if no overlapping pixels, this should be an array of 1's
+        self.all_freq = torch.tensor(all_freq, dtype=torch.int32, device=kokkos_device())  # if no overlapping pixels, this should be an array of 1's
         if not self.params.roi.allow_overlapping_spots:
-            if not np.all(self.all_freq==1):
-                print(set(self.all_freq))
+            if not torch.all(self.all_freq==1):
+                print(set(self.all_freq.cpu().numpy()))
                 raise ValueError("There are overlapping regions of interest, despite the command to not allow overlaps")
 
         self.all_q_perpix = np.array(all_q_perpix)
@@ -1635,8 +1635,9 @@ def model(x, Mod, SIM,  compute_grad=True, dont_rescale_gradient=False, update_s
                 J[G.xpos] += scale_grad
 
             if RotXYZ_params[0].refine:
+                rot_grads = scale * torch.from_dlpack(SIM.D.get_d_Umat_images())
                 for i_rot in range(3):
-                    rot_grad = scale * torch.from_dlpack(SIM.D.get_d_Umat_images())[:npix]
+                    rot_grad = rot_grads[i_rot*npix:(i_rot+1)*npix]
                     rot_p = RotXYZ_params[i_rot]
                     rot_grad = rot_p.get_deriv(x[rot_p.xpos], rot_grad)
                     J[rot_p.xpos] += rot_grad

From 5a2893517295240cb2af2f5c46e191f1c5aa1ff6 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Mon, 11 Mar 2024 08:48:11 -0700
Subject: [PATCH 13/17] updates to hopper_utils for pytorch

---
 simtbx/diffBragg/hopper_ensemble_utils.py |  9 +++--
 simtbx/diffBragg/hopper_utils.py          | 49 +++++++++++------------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/simtbx/diffBragg/hopper_ensemble_utils.py b/simtbx/diffBragg/hopper_ensemble_utils.py
index f15f608d10..16976e754a 100644
--- a/simtbx/diffBragg/hopper_ensemble_utils.py
+++ b/simtbx/diffBragg/hopper_ensemble_utils.py
@@ -5,6 +5,7 @@
 import socket
 import logging
 import os
+import torch
 import numpy as np
 from scipy.optimize import basinhopping
 
@@ -105,7 +106,7 @@ def target_func(x, modelers):
 
     f = 0  # target functional
     g = np.zeros(modelers.num_total_modelers * num_shot_params)
-    g_fhkl = np.zeros(num_fhkl_params)
+    g_fhkl = torch.zeros(num_fhkl_params)
     zscore_sigs = []
     fcell_params = x[-num_fhkl_params:]
     for ii, i_shot in enumerate(modelers):
@@ -126,13 +127,13 @@ def target_func(x, modelers):
         # data contributions to target function
         V = model_pix + shot_modeler.all_sigma_rdout**2
         resid_square = resid**2
-        shot_fLogLike = (.5*(np.log(2*np.pi*V) + resid_square / V))
+        shot_fLogLike = (.5*(torch.log(2*np.pi*V) + resid_square / V))
         if shot_modeler.params.roi.allow_overlapping_spots:
             shot_fLogLike /= shot_modeler.all_freq
         shot_fLogLike = shot_fLogLike[shot_modeler.all_trusted].sum()   # negative log Likelihood target
         f += shot_fLogLike
 
-        zscore_sig = np.std((resid / np.sqrt(V))[shot_modeler.all_trusted])
+        zscore_sig = torch.std((resid / torch.sqrt(V))[shot_modeler.all_trusted]).item()
         zscore_sigs.append(zscore_sig)
 
         # get this shots contribution to the gradient
@@ -145,7 +146,7 @@ def target_func(x, modelers):
         for name in shot_modeler.non_fhkl_params:
             p = shot_modeler.P[name]
             Jac_p = Jac[p.xpos]
-            shot_g[p.xpos] += (Jac_p[shot_modeler.all_trusted] * common_grad_term).sum()
+            shot_g[p.xpos] += (Jac_p[shot_modeler.all_trusted] * common_grad_term).sum().item()
         np.add.at(g, shot_x_slice, shot_g)
 
         spot_scale_p = shot_modeler.P["G_xtal0"]
diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index 50e0acaadb..e3138da67c 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -730,18 +730,18 @@ def dump_gathered_to_refl(self, output_name, do_xyobs_sanity_check=False):
             roi_sel = self.roi_id==i_roi
             x1, x2, y1, y2 = self.rois[i_roi]
             roi_shape = y2-y1, x2-x1
-            roi_img = self.all_data[roi_sel].reshape(roi_shape).astype(np.float32)  #NOTE this has already been converted to photon units
-            roi_bg = self.all_background[roi_sel].reshape(roi_shape).astype(np.float32)
+            roi_img = self.all_data[roi_sel].reshape(roi_shape).float()  #NOTE this has already been converted to photon units
+            roi_bg = self.all_background[roi_sel].reshape(roi_shape).float()
 
             sb = Shoebox((x1, x2, y1, y2, 0, 1))
             sb.allocate()
-            sb.data = flex.float(np.ascontiguousarray(roi_img[None]))
-            sb.background = flex.float(np.ascontiguousarray(roi_bg[None]))
+            sb.data = flex.float(roi_img[None].cpu().contiguous().numpy())
+            sb.background = flex.float(roi_bg[None].cpu().contiguous().numpy())
 
-            dials_mask = np.zeros(roi_img.shape).astype(np.int32)
+            dials_mask = torch.zeros(roi_img.shape, device=kokkos_device()).int()
             mask = self.all_trusted[roi_sel].reshape(roi_shape)
             dials_mask[mask] = dials_mask[mask] + MaskCode.Valid
-            sb.mask = flex.int(np.ascontiguousarray(dials_mask[None]))
+            sb.mask = flex.int(dials_mask[None].cpu().contiguous().numpy())
 
             # quick sanity test
             if do_xyobs_sanity_check:
@@ -1250,7 +1250,7 @@ def save_up(self, x, SIM, rank=0, i_shot=0,
             SIM.D.force_cpu = True
             MAIN_LOGGER.info("Getting Fhkl errors (forcing CPUkernel usage)... might take some time")
             Fhkl_scale_errors = SIM.D.add_Fhkl_gradients(
-                self.pan_fast_slow, resid, V, self.all_trusted, self.all_freq,
+                self.pan_fast_slow, resid.cpu().numpy(), V.cpu().numpy(), self.all_trusted.cpu().numpy(), self.all_freq.cpu().numpy(),
                 SIM.num_Fhkl_channels, G, track=True, errors=True)
             SIM.D.force_gpu = force_cpu
             # ------------
@@ -1338,21 +1338,22 @@ def save_up(self, x, SIM, rank=0, i_shot=0,
                 fit = model_subimg[i_roi]
                 trust = trusted_subimg[i_roi]
                 if sigma_rdout_subimg is not None:
-                    sig = np.sqrt(fit + sigma_rdout_subimg[i_roi] ** 2)
+                    sig = torch.sqrt(fit + sigma_rdout_subimg[i_roi] ** 2)
                 else:
-                    sig = np.sqrt(fit + Modeler.nominal_sigma_rdout ** 2)
+                    sig = torch.sqrt(fit + Modeler.nominal_sigma_rdout ** 2)
                 Z = (dat - fit) / sig
                 sigmaZ = np.nan
-                if np.any(trust):
-                    sigmaZ = Z[trust].std()
+                if torch.any(trust):
+                    sigmaZ = Z[trust].std().item()
 
                 sigmaZs.append(sigmaZ)
                 if bragg_subimg[0] is not None:
-                    if np.any(bragg_subimg[i_roi] > 0):
+                    if torch.any(bragg_subimg[i_roi] > 0):
                         ref_idx = Modeler.refls_idx[i_roi]
                         ref = Modeler.refls[ref_idx]
                         I = bragg_subimg[i_roi]
-                        Y, X = np.indices(bragg_subimg[i_roi].shape)
+                        ny, nx = bragg_subimg[i_roi].shape
+                        Y, X = torch.meshgrid(torch.arange(ny, device=kokkos_device()), torch.arange(nx, device=kokkos_device()), indexing='ij')
                         x1, x2, y1, y2 = Modeler.rois[i_roi]
                         com_x, com_y, _ = ref["xyzobs.px.value"]
                         com_x = int(com_x - x1 - 0.5)
@@ -1363,11 +1364,11 @@ def save_up(self, x, SIM, rank=0, i_shot=0,
                                 continue
                         except IndexError:
                             continue
-                        X += x1
-                        Y += y1
-                        Isum = I.sum()
-                        xcom = (X * I).sum() / Isum
-                        ycom = (Y * I).sum() / Isum
+                        X = X + x1
+                        Y = Y + y1
+                        Isum = I.sum().item()
+                        xcom = (X * I).sum().item() / Isum
+                        ycom = (Y * I).sum().item() / Isum
                         com = xcom + .5, ycom + .5, 0
                         new_xycalcs[ref_idx] = com
                         if not Modeler.params.fix.perRoiScale:
@@ -2041,8 +2042,8 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores
         if SIM.refining_Fhkl:
             spot_scale_p = mod.P["G_xtal0"]
             G = spot_scale_p.get_val(x[spot_scale_p.xpos])
-            fhkl_grad = SIM.D.add_Fhkl_gradients(pfs, resid, V, trusted,
-                                                 mod.all_freq, SIM.num_Fhkl_channels, G)
+            fhkl_grad = SIM.D.add_Fhkl_gradients(pfs, resid.cpu().numpy(), V.cpu().numpy(), trusted.cpu().numpy(),
+                                                 mod.all_freq.cpu().numpy(), SIM.num_Fhkl_channels, G)
 
             if params.betas.Fhkl is not None:
                 for i_chan in range(SIM.num_Fhkl_channels):
@@ -2209,11 +2210,9 @@ def get_new_xycalcs(Modeler, new_exp, old_refl_tag="dials"):
 
             X = X + x1
             Y = Y + y1
-            Isum = I.sum()
-            xcom = (X * I).sum() / Isum + .5
-            xcom = xcom.item()
-            ycom = (Y * I).sum() / Isum + .5
-            ycom = ycom.item()
+            Isum = I.sum().item()
+            xcom = (X * I).sum().item() / Isum + .5
+            ycom = (Y * I).sum().item() / Isum + .5
             com = xcom, ycom, 0
 
             pid = Modeler.pids[i_roi]

From ae85f39b04fc9da70e404a1bdbcd48dcf91bef32 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Thu, 14 Mar 2024 14:04:44 -0700
Subject: [PATCH 14/17] Add host_transfer flag to toggle D2H copies

---
 simtbx/diffBragg/attr_list.py            |   1 +
 simtbx/diffBragg/hopper_utils.py         |   8 +
 simtbx/diffBragg/src/diffBragg.cpp       | 205 +++++++++++++++--------
 simtbx/diffBragg/src/diffBragg.h         |   5 +-
 simtbx/diffBragg/src/diffBraggKOKKOS.cpp |  96 +++++------
 simtbx/diffBragg/src/diffBragg_ext.cpp   |  12 +-
 simtbx/diffBragg/src/util.h              |  45 +++++
 7 files changed, 244 insertions(+), 128 deletions(-)

diff --git a/simtbx/diffBragg/attr_list.py b/simtbx/diffBragg/attr_list.py
index f961528cd0..fdf7d65459 100644
--- a/simtbx/diffBragg/attr_list.py
+++ b/simtbx/diffBragg/attr_list.py
@@ -29,6 +29,7 @@
  'fluence',
  'flux',
  'has_anisotropic_mosaic_spread',
+ 'host_transfer',
  'interpolate',
  'isotropic_ncells',
  'lambda_coefficients',
diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index e3138da67c..5c647f6729 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -2081,6 +2081,11 @@ def refine(exp, ref, params, spec=None, gpu_device=None, return_modeler=False, b
     SIM = get_simulator_for_data_modelers(Modeler)
     Modeler.set_parameters_for_experiment(best=best)
     SIM.D.device_Id = gpu_device
+    old_transfer = None
+    if os.environ.get("DIFFBRAGG_USE_KOKKOS") is not None:
+        if SIM.D.host_transfer == True:
+            old_transfer = True
+            SIM.D.host_transfer = False
 
     nparam = len(Modeler.P)
     if SIM.refining_Fhkl:
@@ -2108,6 +2113,9 @@ def refine(exp, ref, params, spec=None, gpu_device=None, return_modeler=False, b
     if free_mem:
         Modeler.clean_up(SIM)
 
+    if old_transfer is not None:
+        SIM.D.host_transfer = old_transfer
+
     if return_modeler:
         return new_exp, new_refl, Modeler, SIM, x
 
diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index 4ede77192d..3bd9821444 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -364,6 +364,7 @@ diffBragg::diffBragg(const dxtbx::model::Detector& detector, const dxtbx::model:
 
     O_reference <<0,0,0;
 
+    host_transfer = true;
     update_oversample_during_refinement = true;
     oversample_omega = true;
     only_save_omega_kahn = false;
@@ -1508,7 +1509,6 @@ boost::python::tuple diffBragg::get_ncells_derivative_pixels(){
     return derivative_pixels;
 }
 
-#ifdef DIFFBRAGG_HAVE_KOKKOS
 void dlpack_destructor(PyObject* capsule) {
     if (!PyCapsule_IsValid(capsule, "dltensor")) {
         return;
@@ -1530,7 +1530,12 @@ PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func
 }
 
 PyObject* diffBragg::get_floatimage() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_floatimage);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_floatimage);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(raw_pixels_roi.begin(), Npix_to_model), "dltensor", dlpack_destructor);
 }
 
 PyObject* diffBragg::get_wavelenimage() {
@@ -1546,7 +1551,12 @@ PyObject* diffBragg::get_d_diffuse_sigma_images() {
 }
 
 PyObject* diffBragg::get_d_Umat_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Umat_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Umat_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.Umat.data(), 3*Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_Umat_images() {
@@ -1554,15 +1564,25 @@ PyObject* diffBragg::get_d2_Umat_images() {
 }
 
 PyObject* diffBragg::get_d_Bmat_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images);
-}
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.Bmat.data(), 6*Npix_to_model), "dltensor", dlpack_destructor);    
+}    
 
 PyObject* diffBragg::get_d2_Bmat_images() {
     return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Bmat_images);
 }
 
 PyObject* diffBragg::get_d_Ncells_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Ncells_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Ncells_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.Ncells.data(), 6*Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_Ncells_images() {
@@ -1570,7 +1590,12 @@ PyObject* diffBragg::get_d2_Ncells_images() {
 }
 
 PyObject* diffBragg::get_d_fcell_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fcell_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fcell_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fcell.data(), Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_fcell_images() {
@@ -1578,7 +1603,12 @@ PyObject* diffBragg::get_d2_fcell_images() {
 }
 
 PyObject* diffBragg::get_d_eta_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_eta_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_eta_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.eta.data(), first_deriv_imgs.eta.size()), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_eta_images() {
@@ -1586,7 +1616,12 @@ PyObject* diffBragg::get_d2_eta_images() {
 }
 
 PyObject* diffBragg::get_d_lambda_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_lambda_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_lambda_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.lambda.data(), 2*Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_lambda_images() {
@@ -1594,7 +1629,12 @@ PyObject* diffBragg::get_d2_lambda_images() {
 }
 
 PyObject* diffBragg::get_d_panel_rot_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_rot_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_rot_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.panel_rot.data(), 3*Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_panel_rot_images() {
@@ -1602,7 +1642,12 @@ PyObject* diffBragg::get_d2_panel_rot_images() {
 }
 
 PyObject* diffBragg::get_d_panel_orig_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_orig_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_orig_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.panel_orig.data(), 3*Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_d2_panel_orig_images() {
@@ -1610,13 +1655,23 @@ PyObject* diffBragg::get_d2_panel_orig_images() {
 }
 
 PyObject* diffBragg::get_d_fp_fdp_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images);
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images);
+    }
+#endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fp_fdp.data(), 2*Npix_to_model), "dltensor", dlpack_destructor);    
 }
 
 PyObject* diffBragg::get_Fhkl_scale_deriv() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv);
-}
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images);
+    }
 #endif
+    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fp_fdp.data(), first_deriv_imgs.fp_fdp.size()), "dltensor", dlpack_destructor);    
+}
+
 
 boost::python::tuple diffBragg::get_diffuse_gamma_derivative_pixels(){
     SCITBX_ASSERT(db_flags.refine_diffuse);
@@ -1958,7 +2013,6 @@ void diffBragg::add_diffBragg_spots(const af::shared<size_t>& panels_fasts_slows
 
     Npix_to_model = panels_fasts_slows.size()/3;
     SCITBX_ASSERT(Npix_to_model <= Npix_total);
-    double * floatimage_roi = raw_pixels_roi.begin();
 
     diffBragg_rot_mats();
     /* make sure we are normalizing with the right number of sub-steps */
@@ -2050,6 +2104,7 @@ void diffBragg::add_diffBragg_spots(const af::shared<size_t>& panels_fasts_slows
     db_flags.refine_fp_fdp = fp_fdp_managers[0]->refine_me;
     db_flags.use_lambda_coefficients = use_lambda_coefficients;
     db_flags.oversample_omega = oversample_omega;
+    db_flags.host_transfer = host_transfer;
     db_flags.printout_fpixel = printout_fpixel;
     db_flags.printout_spixel = printout_spixel;
     db_flags.verbose = verbose;
@@ -2278,81 +2333,84 @@ void diffBragg::add_diffBragg_spots(const af::shared<size_t>& panels_fasts_slows
 
     gettimeofday(&t1,0 );
 
-    for (int i_pix=0; i_pix< Npix_to_model; i_pix++){
-        floatimage_roi[i_pix] = image[i_pix];
+    if (db_flags.host_transfer) {
+        double * floatimage_roi = raw_pixels_roi.begin();
+        for (int i_pix=0; i_pix< Npix_to_model; i_pix++){
+            floatimage_roi[i_pix] = image[i_pix];
 
-        for (int i_rot=0; i_rot<3; i_rot++){
-            if (rot_managers[i_rot]->refine_me){
-                int idx = i_rot*Npix_to_model + i_pix;
-                rot_managers[i_rot]->increment_image(i_pix, first_deriv_imgs.Umat[idx], second_deriv_imgs.Umat[idx], compute_curvatures);
+            for (int i_rot=0; i_rot<3; i_rot++){
+                if (rot_managers[i_rot]->refine_me){
+                    int idx = i_rot*Npix_to_model + i_pix;
+                    rot_managers[i_rot]->increment_image(i_pix, first_deriv_imgs.Umat[idx], second_deriv_imgs.Umat[idx], compute_curvatures);
+                }
             }
-        }
-        for (int i_uc=0; i_uc<6; i_uc++){
-            if (ucell_managers[i_uc]->refine_me){
-                int idx = i_uc*Npix_to_model + i_pix;
-                ucell_managers[i_uc]->increment_image(i_pix, first_deriv_imgs.Bmat[idx], second_deriv_imgs.Bmat[idx], compute_curvatures);
+            for (int i_uc=0; i_uc<6; i_uc++){
+                if (ucell_managers[i_uc]->refine_me){
+                    int idx = i_uc*Npix_to_model + i_pix;
+                    ucell_managers[i_uc]->increment_image(i_pix, first_deriv_imgs.Bmat[idx], second_deriv_imgs.Bmat[idx], compute_curvatures);
+                }
             }
-        }
-        if (Ncells_managers[0]->refine_me){
-            Ncells_managers[0]->increment_image(i_pix, first_deriv_imgs.Ncells[i_pix], second_deriv_imgs.Ncells[i_pix], compute_curvatures);
-            if (! isotropic_ncells){
-                int idx= Npix_to_model+i_pix;
-                Ncells_managers[1]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures);
-                idx = 2*Npix_to_model + i_pix;
-                Ncells_managers[2]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures);
+            if (Ncells_managers[0]->refine_me){
+                Ncells_managers[0]->increment_image(i_pix, first_deriv_imgs.Ncells[i_pix], second_deriv_imgs.Ncells[i_pix], compute_curvatures);
+                if (! isotropic_ncells){
+                    int idx= Npix_to_model+i_pix;
+                    Ncells_managers[1]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures);
+                    idx = 2*Npix_to_model + i_pix;
+                    Ncells_managers[2]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures);
+                }
             }
-        }
 
-        if (refine_Ncells_def){
-            for (int i_nc =3; i_nc < 6; i_nc++){
-                int idx= i_nc*Npix_to_model+i_pix;
-                Ncells_managers[i_nc]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures);
+            if (refine_Ncells_def){
+                for (int i_nc =3; i_nc < 6; i_nc++){
+                    int idx= i_nc*Npix_to_model+i_pix;
+                    Ncells_managers[i_nc]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures);
+                }
             }
-        }
 
-        if (fcell_managers[0]->refine_me){
-            int idx= i_pix;
-            fcell_managers[0]->increment_image(i_pix, first_deriv_imgs.fcell[idx], second_deriv_imgs.fcell[idx], compute_curvatures);
-        }
+            if (fcell_managers[0]->refine_me){
+                int idx= i_pix;
+                fcell_managers[0]->increment_image(i_pix, first_deriv_imgs.fcell[idx], second_deriv_imgs.fcell[idx], compute_curvatures);
+            }
 
-        if (eta_managers[0]->refine_me){
-            eta_managers[0]->increment_image(i_pix, first_deriv_imgs.eta[i_pix], second_deriv_imgs.eta[i_pix], compute_curvatures);
-            if (modeling_anisotropic_mosaic_spread){
-                if (verbose && i_pix==0)printf("copying aniso eta derivatives\n");
-                for(int i_eta=1; i_eta < 3; i_eta++){
-                    int idx = i_eta*Npix_to_model+i_pix;
-                    eta_managers[i_eta]->increment_image(i_pix, first_deriv_imgs.eta[idx], second_deriv_imgs.eta[idx], compute_curvatures);
+            if (eta_managers[0]->refine_me){
+                eta_managers[0]->increment_image(i_pix, first_deriv_imgs.eta[i_pix], second_deriv_imgs.eta[i_pix], compute_curvatures);
+                if (modeling_anisotropic_mosaic_spread){
+                    if (verbose && i_pix==0)printf("copying aniso eta derivatives\n");
+                    for(int i_eta=1; i_eta < 3; i_eta++){
+                        int idx = i_eta*Npix_to_model+i_pix;
+                        eta_managers[i_eta]->increment_image(i_pix, first_deriv_imgs.eta[idx], second_deriv_imgs.eta[idx], compute_curvatures);
+                    }
                 }
             }
-        }
 
-        for(int i_lam=0; i_lam < 2; i_lam++){
-            if (lambda_managers[i_lam]->refine_me){
-                int idx= Npix_to_model*i_lam + i_pix;
-                lambda_managers[i_lam]->increment_image(i_pix, first_deriv_imgs.lambda[idx], second_deriv_imgs.lambda[idx], compute_curvatures);
+            for(int i_lam=0; i_lam < 2; i_lam++){
+                if (lambda_managers[i_lam]->refine_me){
+                    int idx= Npix_to_model*i_lam + i_pix;
+                    lambda_managers[i_lam]->increment_image(i_pix, first_deriv_imgs.lambda[idx], second_deriv_imgs.lambda[idx], compute_curvatures);
+                }
             }
-        }
 
-        for(int i_pan=0; i_pan <3; i_pan++){
-            int i_rot = pan_rot_ids[i_pan];
-            if (panels[i_rot]->refine_me){
-                int idx = Npix_to_model*i_pan + i_pix;
-                panels[i_rot]->increment_image(i_pix, first_deriv_imgs.panel_rot[idx], second_deriv_imgs.panel_rot[idx], compute_curvatures);
-            }
+            for(int i_pan=0; i_pan <3; i_pan++){
+                int i_rot = pan_rot_ids[i_pan];
+                if (panels[i_rot]->refine_me){
+                    int idx = Npix_to_model*i_pan + i_pix;
+                    panels[i_rot]->increment_image(i_pix, first_deriv_imgs.panel_rot[idx], second_deriv_imgs.panel_rot[idx], compute_curvatures);
+                }
 
-            int i_orig = pan_orig_ids[i_pan];
-            if(panels[i_orig]->refine_me){
-                int idx= Npix_to_model*i_pan + i_pix;
-                panels[i_orig]->increment_image(i_pix, first_deriv_imgs.panel_orig[idx], second_deriv_imgs.panel_orig[idx], compute_curvatures);
+                int i_orig = pan_orig_ids[i_pan];
+                if(panels[i_orig]->refine_me){
+                    int idx= Npix_to_model*i_pan + i_pix;
+                    panels[i_orig]->increment_image(i_pix, first_deriv_imgs.panel_orig[idx], second_deriv_imgs.panel_orig[idx], compute_curvatures);
+                }
             }
-        }
 
-        if (fp_fdp_managers[0]->refine_me)
-            fp_fdp_managers[0]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix], 0, compute_curvatures);
-        if (fp_fdp_managers[1]->refine_me)
-            fp_fdp_managers[1]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix+Npix_to_model], 0, compute_curvatures);
+            if (fp_fdp_managers[0]->refine_me)
+                fp_fdp_managers[0]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix], 0, compute_curvatures);
+            if (fp_fdp_managers[1]->refine_me)
+                fp_fdp_managers[1]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix+Npix_to_model], 0, compute_curvatures);
 
-    } // END of flex array update
+        } // END of flex array update
+    }
 
     delete[] db_steps.subS_pos;
     delete[] db_steps.subF_pos;
@@ -2368,7 +2426,6 @@ void diffBragg::add_diffBragg_spots(const af::shared<size_t>& panels_fasts_slows
         TIMERS.timings += 1; // only increment timings at the end of the add_diffBragg_spots call
     }
 
-
     if(verbose) printf("done with pixel loop\n");
 } // END  of add_diffBragg_spots
 
diff --git a/simtbx/diffBragg/src/diffBragg.h b/simtbx/diffBragg/src/diffBragg.h
index 599e55a3d0..882a150870 100644
--- a/simtbx/diffBragg/src/diffBragg.h
+++ b/simtbx/diffBragg/src/diffBragg.h
@@ -237,7 +237,7 @@ class diffBragg: public nanoBragg{
   af::flex_double get_raw_pixels_roi();
   boost::python::tuple get_fp_fdp_derivative_pixels();
   boost::python::tuple get_ncells_derivative_pixels();
-#ifdef DIFFBRAGG_HAVE_KOKKOS
+
   PyObject* PyCapsule_Wrapper(DLManagedTensor* (diffBraggKOKKOS::*func)());
   PyObject* get_floatimage();
   PyObject* get_wavelenimage();
@@ -261,7 +261,7 @@ class diffBragg: public nanoBragg{
   PyObject* get_d2_panel_orig_images();
   PyObject* get_d_fp_fdp_images();
   PyObject* get_Fhkl_scale_deriv();
-#endif
+
   boost::python::tuple get_diffuse_gamma_derivative_pixels();
   boost::python::tuple get_diffuse_sigma_derivative_pixels();
   boost::python::tuple get_ncells_def_derivative_pixels();
@@ -311,6 +311,7 @@ class diffBragg: public nanoBragg{
   bool update_oversample_during_refinement;
   bool oversample_omega;
   bool only_save_omega_kahn;
+  bool host_transfer;
 
   // miller array
   void quick_Fcell_update(boost::python::tuple const& value);
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
index 016f684d05..4656835311 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
@@ -581,56 +581,58 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
     gettimeofday(&t1, 0);
     //  COPY BACK FROM DEVICE
     Kokkos::Tools::pushRegion("COPY BACK FROM DEVICE");
-    kokkostbx::transfer_kokkos2vector(floatimage, m_floatimage);
+    if (db_flags.host_transfer) {
+        kokkostbx::transfer_kokkos2vector(floatimage, m_floatimage);
 
-    if (db_flags.wavelength_img) {
-        kokkostbx::transfer_kokkos2vector(d_image.wavelength, m_wavelenimage);
-    }
-    if (db_flags.refine_fcell) {
-        kokkostbx::transfer_kokkos2vector(d_image.fcell, m_d_fcell_images);
-        kokkostbx::transfer_kokkos2vector(d2_image.fcell, m_d2_fcell_images);
-    }
-    if (db_flags.Fhkl_gradient_mode){
-        if (db_flags.Fhkl_errors_mode){
-            kokkostbx::transfer_kokkos2vector(d_image.Fhkl_hessian, m_Fhkl_scale_deriv);
+        if (db_flags.wavelength_img) {
+            kokkostbx::transfer_kokkos2vector(d_image.wavelength, m_wavelenimage);
         }
-        else{
-            kokkostbx::transfer_kokkos2vector(d_image.Fhkl_scale_deriv, m_Fhkl_scale_deriv);
+        if (db_flags.refine_fcell) {
+            kokkostbx::transfer_kokkos2vector(d_image.fcell, m_d_fcell_images);
+            kokkostbx::transfer_kokkos2vector(d2_image.fcell, m_d2_fcell_images);
+        }
+        if (db_flags.Fhkl_gradient_mode){
+            if (db_flags.Fhkl_errors_mode){
+                kokkostbx::transfer_kokkos2vector(d_image.Fhkl_hessian, m_Fhkl_scale_deriv);
+            }
+            else{
+                kokkostbx::transfer_kokkos2vector(d_image.Fhkl_scale_deriv, m_Fhkl_scale_deriv);
+            }
+        }
+        if (std::count(db_flags.refine_Umat.begin(), db_flags.refine_Umat.end(), true) > 0) {
+            kokkostbx::transfer_kokkos2vector(d_image.Umat, m_d_Umat_images);
+            kokkostbx::transfer_kokkos2vector(d2_image.Umat, m_d2_Umat_images);
+        }
+        if (std::count(db_flags.refine_panel_rot.begin(), db_flags.refine_panel_rot.end(), true) > 0) {
+            kokkostbx::transfer_kokkos2vector(d_image.panel_rot, m_d_panel_rot_images);
+        }
+        if (std::count(db_flags.refine_panel_origin.begin(), db_flags.refine_panel_origin.end(), true) >
+            0) {
+            kokkostbx::transfer_kokkos2vector(d_image.panel_orig, m_d_panel_orig_images);
+        }
+        if (db_flags.refine_eta) {
+            kokkostbx::transfer_kokkos2vector(d_image.eta, m_d_eta_images);
+            kokkostbx::transfer_kokkos2vector(d2_image.eta, m_d2_eta_images);
+        }
+        if (std::count(db_flags.refine_Ncells.begin(), db_flags.refine_Ncells.end(), true) > 0 ||
+            db_flags.refine_Ncells_def) {
+            kokkostbx::transfer_kokkos2vector(d_image.Ncells, m_d_Ncells_images);
+            kokkostbx::transfer_kokkos2vector(d2_image.Ncells, m_d2_Ncells_images);
+        }
+        if (db_flags.refine_diffuse) {
+            kokkostbx::transfer_kokkos2vector(d_image.diffuse_gamma, m_d_diffuse_gamma_images);
+            kokkostbx::transfer_kokkos2vector(d_image.diffuse_sigma, m_d_diffuse_sigma_images);
+        }
+        if (std::count(db_flags.refine_Bmat.begin(), db_flags.refine_Bmat.end(), true) > 0) {
+            kokkostbx::transfer_kokkos2vector(d_image.Bmat, m_d_Bmat_images);
+            kokkostbx::transfer_kokkos2vector(d2_image.Bmat, m_d2_Bmat_images);
+        }
+        if (std::count(db_flags.refine_lambda.begin(), db_flags.refine_lambda.end(), true) > 0) {
+            kokkostbx::transfer_kokkos2vector(d_image.lambda, m_d_lambda_images);
+        }
+        if (db_flags.refine_fp_fdp) {
+            kokkostbx::transfer_kokkos2vector(d_image.fp_fdp, m_d_fp_fdp_images);
         }
-    }
-    if (std::count(db_flags.refine_Umat.begin(), db_flags.refine_Umat.end(), true) > 0) {
-        kokkostbx::transfer_kokkos2vector(d_image.Umat, m_d_Umat_images);
-        kokkostbx::transfer_kokkos2vector(d2_image.Umat, m_d2_Umat_images);
-    }
-    if (std::count(db_flags.refine_panel_rot.begin(), db_flags.refine_panel_rot.end(), true) > 0) {
-        kokkostbx::transfer_kokkos2vector(d_image.panel_rot, m_d_panel_rot_images);
-    }
-    if (std::count(db_flags.refine_panel_origin.begin(), db_flags.refine_panel_origin.end(), true) >
-        0) {
-        kokkostbx::transfer_kokkos2vector(d_image.panel_orig, m_d_panel_orig_images);
-    }
-    if (db_flags.refine_eta) {
-        kokkostbx::transfer_kokkos2vector(d_image.eta, m_d_eta_images);
-        kokkostbx::transfer_kokkos2vector(d2_image.eta, m_d2_eta_images);
-    }
-    if (std::count(db_flags.refine_Ncells.begin(), db_flags.refine_Ncells.end(), true) > 0 ||
-        db_flags.refine_Ncells_def) {
-        kokkostbx::transfer_kokkos2vector(d_image.Ncells, m_d_Ncells_images);
-        kokkostbx::transfer_kokkos2vector(d2_image.Ncells, m_d2_Ncells_images);
-    }
-    if (db_flags.refine_diffuse) {
-        kokkostbx::transfer_kokkos2vector(d_image.diffuse_gamma, m_d_diffuse_gamma_images);
-        kokkostbx::transfer_kokkos2vector(d_image.diffuse_sigma, m_d_diffuse_sigma_images);
-    }
-    if (std::count(db_flags.refine_Bmat.begin(), db_flags.refine_Bmat.end(), true) > 0) {
-        kokkostbx::transfer_kokkos2vector(d_image.Bmat, m_d_Bmat_images);
-        kokkostbx::transfer_kokkos2vector(d2_image.Bmat, m_d2_Bmat_images);
-    }
-    if (std::count(db_flags.refine_lambda.begin(), db_flags.refine_lambda.end(), true) > 0) {
-        kokkostbx::transfer_kokkos2vector(d_image.lambda, m_d_lambda_images);
-    }
-    if (db_flags.refine_fp_fdp) {
-        kokkostbx::transfer_kokkos2vector(d_image.fp_fdp, m_d_fp_fdp_images);
     }
 
     Kokkos::Tools::popRegion();
diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp
index dd1fe393b8..feafb15c49 100644
--- a/simtbx/diffBragg/src/diffBragg_ext.cpp
+++ b/simtbx/diffBragg/src/diffBragg_ext.cpp
@@ -507,11 +507,8 @@ namespace boost_python { namespace {
 std::string kokkos_device() {
   std::string backend = "cpu:0";
 #ifdef DIFFBRAGG_HAVE_KOKKOS
-  if (Kokkos::is_finalized()) {
-    throw std::runtime_error("Error: Kokkos has been finalized.\n");
-  }
-  if (!Kokkos::is_initialized()) {
-    throw std::runtime_error("Error: Kokkos not initialized.\n");
+  if (Kokkos::is_finalized() || !Kokkos::is_initialized()) {
+    return backend;
   }
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   backend = "cuda:" + std::to_string( Kokkos::device_id() );
@@ -879,6 +876,11 @@ struct DLPackAPI {
                      make_setter(&simtbx::nanoBragg::diffBragg::oversample_omega,dcp()),
                     "whether to use an average solid angle correction per pixel, or one at the sub pixel level")
 
+      .add_property("host_transfer",
+                     make_getter(&simtbx::nanoBragg::diffBragg::host_transfer,rbv()),
+                     make_setter(&simtbx::nanoBragg::diffBragg::host_transfer,dcp()),
+                    "whether to transfer results from device to host")
+
       .add_property("force_cpu",
                      make_getter(&simtbx::nanoBragg::diffBragg::force_cpu,rbv()),
                      make_setter(&simtbx::nanoBragg::diffBragg::force_cpu,dcp()),
diff --git a/simtbx/diffBragg/src/util.h b/simtbx/diffBragg/src/util.h
index ec197c1dc9..b95ffebd4f 100644
--- a/simtbx/diffBragg/src/util.h
+++ b/simtbx/diffBragg/src/util.h
@@ -10,6 +10,8 @@
 #include <string>
 #include <sys/time.h>
 
+#include "dlpack/dlpack.h"
+
 #ifndef CUDAREAL
     #define CUDAREAL double
 #endif
@@ -29,6 +31,48 @@ inline void easy_time(double& timer, struct timeval& t, bool recording){
         timer += time;
 }
 
+template<typename DataType>
+DLDataTypeCode getDLPackTypeCode() {  // maps the C++ element type DataType to a DLPack type code
+  if (std::is_same<DataType, float>::value) {
+    return kDLFloat;
+  } else if (std::is_same<DataType, double>::value) {
+    return kDLFloat;  // same code as float: only the type code is returned here, not a bit width
+  } else if (std::is_same<DataType, int>::value) {
+    return kDLInt;
+  } else if (std::is_same<DataType, unsigned int>::value) {
+    return kDLUInt;
+  // } else if (std::is_same<DataType, bool>::value) {
+    // return kDLBool;
+  } else {
+    // Any other DataType (including bool, disabled above) is rejected at run time
+    throw std::runtime_error("Unsupported data type for DLPack conversion");
+  }
+}
+
+template<typename DataType>
+DLManagedTensor* array_to_dlpack(DataType* pointer, int64_t length) {
+  // Describes `pointer` as a 1-D tensor of `length` elements tagged kDLCPU; the data is referenced, not copied.
+  int64_t* shape = new int64_t[1];
+  shape[0] = length;
+
+  // Heap-allocate the managed tensor; it is handed back to the caller as a raw pointer
+  DLManagedTensor* dlpackTensor = new DLManagedTensor;
+  dlpackTensor->dl_tensor.data = static_cast<void*>(pointer);
+  dlpackTensor->dl_tensor.device = {kDLCPU, 0};
+  dlpackTensor->dl_tensor.dtype.code = getDLPackTypeCode<DataType>();
+  dlpackTensor->dl_tensor.dtype.bits = sizeof(DataType) * 8;
+  dlpackTensor->dl_tensor.dtype.lanes = 1;  
+  dlpackTensor->dl_tensor.ndim = 1;  
+  dlpackTensor->dl_tensor.shape = shape;
+  dlpackTensor->dl_tensor.strides = nullptr;  // DLPack reads NULL strides as compact, row-major
+  dlpackTensor->dl_tensor.byte_offset = 0;
+  dlpackTensor->manager_ctx = nullptr;
+  dlpackTensor->deleter = [](DLManagedTensor* tensor) {
+      delete[] tensor->dl_tensor.shape;  // frees the shape array only; NOTE(review): neither `tensor` nor the data is released here, confirm the owner
+  };
+  return dlpackTensor;
+}
+
 struct timer_variables{
     double add_spots_pre=0; // times the initializations for add spots kernel
     double add_spots_post=0; // times the copies that occur after add spots kernel
@@ -124,6 +168,7 @@ struct flags{
     bool isotropic_ncells = false; // one mosaic domain parameter
     bool complex_miller = false;  // is the miller array complex (such thet Fhkl_linear and Fhkl2_linear are both defined)
     bool no_Nabc_scale = false; // no Nabc prefactor
+    bool host_transfer = true; // transfer data after add_diffbragg_spots
     bool refine_diffuse = false; // flag for computing diffuse gradients
     std::vector<bool> refine_Bmat;  //  Bmatrix
     std::vector<bool> refine_Ncells; // mosaic domain size

From fff52bfc65f488f387b9ccde99d8d1959a19f625 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Fri, 15 Mar 2024 09:31:50 -0700
Subject: [PATCH 15/17] Update hopper_utils_ensemble

---
 simtbx/diffBragg/hopper_ensemble_utils.py | 10 ++++++----
 simtbx/diffBragg/hopper_utils.py          |  3 ++-
 simtbx/diffBragg/src/diffBragg.cpp        |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/simtbx/diffBragg/hopper_ensemble_utils.py b/simtbx/diffBragg/hopper_ensemble_utils.py
index 16976e754a..c3e0d712aa 100644
--- a/simtbx/diffBragg/hopper_ensemble_utils.py
+++ b/simtbx/diffBragg/hopper_ensemble_utils.py
@@ -87,7 +87,7 @@ def __call__(self, x, *args, **kwargs):
             if modelers.SIM.D.record_timings:
                 modelers.SIM.D.show_timings()
 
-        return f
+        return f.item()
 
 
 def target_func(x, modelers):
@@ -152,8 +152,10 @@ def target_func(x, modelers):
         spot_scale_p = shot_modeler.P["G_xtal0"]
         G = spot_scale_p.get_val(x[spot_scale_p.xpos])
         g_fhkl += modelers.SIM.D.add_Fhkl_gradients(
-            shot_modeler.pan_fast_slow, resid, V, shot_modeler.all_trusted,
-            shot_modeler.all_freq, modelers.SIM.num_Fhkl_channels, G)
+            shot_modeler.pan_fast_slow, resid.cpu().numpy(), V.cpu().numpy(), shot_modeler.all_trusted.cpu().numpy(),
+            shot_modeler.all_freq.cpu().numpy(), modelers.SIM.num_Fhkl_channels, G)
+        if not modelers.SIM.D.host_transfer:
+            g_fhkl += torch.from_dlpack(modelers.SIM.D.get_Fhkl_scale_deriv())
 
     # add up target and gradients across all ranks
     f = COMM.bcast(COMM.reduce(f))
@@ -496,7 +498,7 @@ def save_up(self, x, ref_iter=None):
             if i_shot % 100==0:
                 MAIN_LOGGER.info("Getting Fhkl errors for shot %d/%d ... " % (i_shot+1, self.num_modelers))
             Fhkl_scale_hessian += self.SIM.D.add_Fhkl_gradients(
-                mod.pan_fast_slow, resid, V, mod.all_trusted, mod.all_freq,
+                mod.pan_fast_slow, resid.cpu().numpy(), V.cpu().numpy(), mod.all_trusted.cpu().numpy(), mod.all_freq.cpu().numpy(),
                 self.SIM.num_Fhkl_channels, G, track=False, errors=True)
             # ------------
 
diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py
index 5c647f6729..9b6a74eb37 100644
--- a/simtbx/diffBragg/hopper_utils.py
+++ b/simtbx/diffBragg/hopper_utils.py
@@ -2044,6 +2044,8 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores
             G = spot_scale_p.get_val(x[spot_scale_p.xpos])
             fhkl_grad = SIM.D.add_Fhkl_gradients(pfs, resid.cpu().numpy(), V.cpu().numpy(), trusted.cpu().numpy(),
                                                  mod.all_freq.cpu().numpy(), SIM.num_Fhkl_channels, G)
+            if not SIM.D.host_transfer:
+                fhkl_grad = torch.from_dlpack(SIM.D.get_Fhkl_scale_deriv()).cpu().numpy()
 
             if params.betas.Fhkl is not None:
                 for i_chan in range(SIM.num_Fhkl_channels):
@@ -2058,7 +2060,6 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores
 
         gnorm = np.linalg.norm(g)
 
-
     debug_s = "F=%10.7g sigZ=%10.7g (Fracs of F: %s), |g|=%10.7g" \
               % (f, zscore_sigma, restraint_debug_s, gnorm)
 
diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index 3bd9821444..3499b5f7db 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1666,7 +1666,7 @@ PyObject* diffBragg::get_d_fp_fdp_images() {
 PyObject* diffBragg::get_Fhkl_scale_deriv() {
 #ifdef DIFFBRAGG_HAVE_KOKKOS
     if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images);
+        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv);
     }
 #endif
     return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fp_fdp.data(), first_deriv_imgs.fp_fdp.size()), "dltensor", dlpack_destructor);    

From 08b2d320ca6f652dc51eeee176a8c1a86e6ce506 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Fri, 15 Mar 2024 14:23:22 -0700
Subject: [PATCH 16/17] add missing get_dlpack functions and refactor

---
 simtbx/diffBragg/src/diffBragg.cpp | 117 ++++++++++-------------------
 simtbx/diffBragg/src/diffBragg.h   |   3 +-
 2 files changed, 42 insertions(+), 78 deletions(-)

diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index 3499b5f7db..c332bf5e92 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1522,154 +1522,117 @@ void dlpack_destructor(PyObject* capsule) {
 
 
 // Fun with pointer-to-member-functions
-PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func)()) {
-    if (diffBragg_runner == nullptr) {
-        return nullptr;
+PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func)(), image_type &vec) {
+#ifdef DIFFBRAGG_HAVE_KOKKOS
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
+        if (diffBragg_runner == nullptr) {
+            return nullptr;
+        }
+        return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);        
     }
-    return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);
+#endif
+    return PyCapsule_New(array_to_dlpack(vec.data(), vec.size()), "dltensor", dlpack_destructor);
 }
 
 PyObject* diffBragg::get_floatimage() {
 #ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_floatimage);
+    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL) {
+        if (diffBragg_runner == nullptr) {
+            return nullptr;
+        }        
+        return PyCapsule_New(diffBragg_runner->get_floatimage(), "dltensor", dlpack_destructor);        
     }
 #endif
     return PyCapsule_New(array_to_dlpack(raw_pixels_roi.begin(), Npix_to_model), "dltensor", dlpack_destructor);
 }
 
 PyObject* diffBragg::get_wavelenimage() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_wavelenimage);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_wavelenimage, first_deriv_imgs.wavelength);
 }
 
 PyObject* diffBragg::get_d_diffuse_gamma_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_gamma_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_gamma_images, first_deriv_imgs.diffuse_gamma);
 }
 
 PyObject* diffBragg::get_d_diffuse_sigma_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_sigma_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_sigma_images, first_deriv_imgs.diffuse_sigma);
 }
 
 PyObject* diffBragg::get_d_Umat_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Umat_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.Umat.data(), 3*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Umat_images, first_deriv_imgs.Umat);
 }
 
 PyObject* diffBragg::get_d2_Umat_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Umat_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Umat_images, second_deriv_imgs.Umat);
 }
 
 PyObject* diffBragg::get_d_Bmat_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.Bmat.data(), 6*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images, first_deriv_imgs.Bmat);
 }    
 
 PyObject* diffBragg::get_d2_Bmat_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Bmat_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Bmat_images, second_deriv_imgs.Bmat);
 }
 
 PyObject* diffBragg::get_d_Ncells_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Ncells_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.Ncells.data(), 6*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Ncells_images, first_deriv_imgs.Ncells);
 }
 
 PyObject* diffBragg::get_d2_Ncells_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Ncells_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Ncells_images, second_deriv_imgs.Ncells);
 }
 
 PyObject* diffBragg::get_d_fcell_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fcell_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fcell.data(), Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fcell_images, first_deriv_imgs.fcell);
 }
 
 PyObject* diffBragg::get_d2_fcell_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_fcell_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_fcell_images, second_deriv_imgs.fcell);
 }
 
 PyObject* diffBragg::get_d_eta_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_eta_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.eta.data(), first_deriv_imgs.eta.size()), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_eta_images, first_deriv_imgs.eta);
 }
 
 PyObject* diffBragg::get_d2_eta_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_eta_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_eta_images, second_deriv_imgs.eta);
 }
 
 PyObject* diffBragg::get_d_lambda_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_lambda_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.lambda.data(), 2*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_lambda_images, first_deriv_imgs.lambda);
 }
 
 PyObject* diffBragg::get_d2_lambda_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_lambda_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_lambda_images, second_deriv_imgs.lambda);
 }
 
 PyObject* diffBragg::get_d_panel_rot_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_rot_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.panel_rot.data(), 3*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_rot_images, first_deriv_imgs.panel_rot);
 }
 
 PyObject* diffBragg::get_d2_panel_rot_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_rot_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_rot_images, second_deriv_imgs.panel_rot);
 }
 
 PyObject* diffBragg::get_d_panel_orig_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_orig_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.panel_orig.data(), 3*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_orig_images, first_deriv_imgs.panel_orig);
 }
 
 PyObject* diffBragg::get_d2_panel_orig_images() {
-    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_orig_images);
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_orig_images, second_deriv_imgs.panel_orig);
 }
 
 PyObject* diffBragg::get_d_fp_fdp_images() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fp_fdp.data(), 2*Npix_to_model), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images, first_deriv_imgs.fp_fdp);
 }
 
 PyObject* diffBragg::get_Fhkl_scale_deriv() {
-#ifdef DIFFBRAGG_HAVE_KOKKOS
-    if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){
-        return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv);
-    }
-#endif
-    return PyCapsule_New(array_to_dlpack(first_deriv_imgs.fp_fdp.data(), first_deriv_imgs.fp_fdp.size()), "dltensor", dlpack_destructor);    
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv, first_deriv_imgs.Fhkl_scale_deriv);
+}
+
+PyObject* diffBragg::get_Fhkl_hessian() {
+    // Not a copy-paste slip: the Kokkos getter get_Fhkl_scale_deriv is reused here on purpose; Fhkl_scale_deriv is overloaded, depending on Fhkl_errors_mode
+    return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv, first_deriv_imgs.Fhkl_hessian);
 }
 
 
diff --git a/simtbx/diffBragg/src/diffBragg.h b/simtbx/diffBragg/src/diffBragg.h
index 882a150870..501f83b58b 100644
--- a/simtbx/diffBragg/src/diffBragg.h
+++ b/simtbx/diffBragg/src/diffBragg.h
@@ -238,7 +238,7 @@ class diffBragg: public nanoBragg{
   boost::python::tuple get_fp_fdp_derivative_pixels();
   boost::python::tuple get_ncells_derivative_pixels();
 
-  PyObject* PyCapsule_Wrapper(DLManagedTensor* (diffBraggKOKKOS::*func)());
+  PyObject* PyCapsule_Wrapper(DLManagedTensor* (diffBraggKOKKOS::*func)(), image_type &vec);
   PyObject* get_floatimage();
   PyObject* get_wavelenimage();
   PyObject* get_d_diffuse_gamma_images();
@@ -261,6 +261,7 @@ class diffBragg: public nanoBragg{
   PyObject* get_d2_panel_orig_images();
   PyObject* get_d_fp_fdp_images();
   PyObject* get_Fhkl_scale_deriv();
+  PyObject* get_Fhkl_hessian();
 
   boost::python::tuple get_diffuse_gamma_derivative_pixels();
   boost::python::tuple get_diffuse_sigma_derivative_pixels();

From 27560804df40db64290592d5ffdd0567c9b19585 Mon Sep 17 00:00:00 2001
From: Felix Wittwer <fwittwer@lbl.gov>
Date: Fri, 15 Mar 2024 14:25:10 -0700
Subject: [PATCH 17/17] Clean clutter

---
 simtbx/diffBragg/src/diffBragg.cpp | 8 ++++----
 simtbx/diffBragg/src/util.h        | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index c332bf5e92..f4b29476d4 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1528,7 +1528,7 @@ PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func
         if (diffBragg_runner == nullptr) {
             return nullptr;
         }
-        return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);        
+        return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor);
     }
 #endif
     return PyCapsule_New(array_to_dlpack(vec.data(), vec.size()), "dltensor", dlpack_destructor);
@@ -1539,8 +1539,8 @@ PyObject* diffBragg::get_floatimage() {
     if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL) {
         if (diffBragg_runner == nullptr) {
             return nullptr;
-        }        
-        return PyCapsule_New(diffBragg_runner->get_floatimage(), "dltensor", dlpack_destructor);        
+        }
+        return PyCapsule_New(diffBragg_runner->get_floatimage(), "dltensor", dlpack_destructor);
     }
 #endif
     return PyCapsule_New(array_to_dlpack(raw_pixels_roi.begin(), Npix_to_model), "dltensor", dlpack_destructor);
@@ -1568,7 +1568,7 @@ PyObject* diffBragg::get_d2_Umat_images() {
 
 PyObject* diffBragg::get_d_Bmat_images() {
     return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images, first_deriv_imgs.Bmat);
-}    
+}
 
 PyObject* diffBragg::get_d2_Bmat_images() {
     return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Bmat_images, second_deriv_imgs.Bmat);
diff --git a/simtbx/diffBragg/src/util.h b/simtbx/diffBragg/src/util.h
index b95ffebd4f..5de97ce4fd 100644
--- a/simtbx/diffBragg/src/util.h
+++ b/simtbx/diffBragg/src/util.h
@@ -51,7 +51,7 @@ DLDataTypeCode getDLPackTypeCode() {
 
 template<typename DataType>
 DLManagedTensor* array_to_dlpack(DataType* pointer, int64_t length) {
-  
+
   int64_t* shape = new int64_t[1];
   shape[0] = length;
 
@@ -61,8 +61,8 @@ DLManagedTensor* array_to_dlpack(DataType* pointer, int64_t length) {
   dlpackTensor->dl_tensor.device = {kDLCPU, 0};
   dlpackTensor->dl_tensor.dtype.code = getDLPackTypeCode<DataType>();
   dlpackTensor->dl_tensor.dtype.bits = sizeof(DataType) * 8;
-  dlpackTensor->dl_tensor.dtype.lanes = 1;  
-  dlpackTensor->dl_tensor.ndim = 1;  
+  dlpackTensor->dl_tensor.dtype.lanes = 1;
+  dlpackTensor->dl_tensor.ndim = 1;
   dlpackTensor->dl_tensor.shape = shape;
   dlpackTensor->dl_tensor.strides = nullptr;
   dlpackTensor->dl_tensor.byte_offset = 0;