Merge pull request #2 from spiceai/jack/spiceai

Update from `EricLBuehler/main`
spiceai · Aug 1, 2024 · 30a1278 · 30a1278
2 parents 41aaac4 + 2064fb0
commit 30a1278
Show file tree

Hide file tree

Showing 122 changed files with 6,457 additions and 1,147 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,10 @@ target/
 # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 Cargo.lock
 
+# editor config
+.helix
+.vscode
+
 # These are backup files generated by rustfmt
 **/*.rs.bk
 

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -8,5 +8,5 @@
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
-    //"rust-analyzer.cargo.features": ["cuda"],
+    "rust-analyzer.cargo.features": ["cuda"],
 }
diff --git a/Cargo.toml b/Cargo.toml
@@ -43,13 +43,13 @@ candle-onnx = { path = "./candle-onnx", version = "0.6.0" }
 candle-transformers = { path = "./candle-transformers", version = "0.6.0" }
 clap = { version = "4.2.4", features = ["derive"] }
 criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "0.11.5", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
+cudarc = { version = "=0.11.6", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
 fancy-regex = "0.13.0"
 gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
 hf-hub = "0.3.0"
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
 hound = "3.5.1"
-image = { version = "0.25.0", default-features = false, features = ["jpeg", "png"] }
+image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
 imageproc = { version = "0.24.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }

diff --git a/README.md b/README.md
@@ -238,7 +238,7 @@ If you have an addition to this list, please submit a pull request.
         - MetaVoice-1B, text-to-speech model.
     - Computer Vision Models.
         - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
-          ConvNeXTv2, MobileOne, EfficientVit (MSRA).
+          ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4.
         - yolo-v3, yolo-v8.
         - Segment-Anything Model (SAM).
         - SegFormer.

diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
@@ -48,3 +48,7 @@ metal = ["dep:metal", "dep:candle-metal-kernels"]
 [[bench]]
 name = "bench_main"
 harness = false
+
+[[example]]
+name = "metal_basics"
+required-features = ["metal"]
diff --git a/candle-core/benches/benchmarks/affine.rs b/candle-core/benches/benchmarks/affine.rs
@@ -12,7 +12,7 @@ fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name:
     let m = 1024;
     let k = 1024;
 
-    let tensor = Tensor::zeros((b, m, k), dtype, &device).unwrap();
+    let tensor = Tensor::zeros((b, m, k), dtype, device).unwrap();
 
     let flops = b * m * k * dtype.size_in_bytes();
 

diff --git a/candle-core/benches/benchmarks/qmatmul.rs b/candle-core/benches/benchmarks/qmatmul.rs
@@ -7,7 +7,7 @@ use criterion::{black_box, criterion_group, Criterion, Throughput};
 use std::time::Instant;
 
 fn run(matmul: &QMatMul, x: &Tensor) {
-    matmul.forward(&x).unwrap();
+    matmul.forward(x).unwrap();
 }
 
 fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
@@ -50,7 +50,7 @@ fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
 fn criterion_benchmark(c: &mut Criterion) {
     let handler = BenchDeviceHandler::new().unwrap();
     for device in handler.devices {
-        for dtype in vec![
+        for dtype in [
             GgmlDType::F32,
             GgmlDType::F16,
             GgmlDType::Q4_0,

diff --git a/candle-core/benches/benchmarks/unary.rs b/candle-core/benches/benchmarks/unary.rs
@@ -12,7 +12,7 @@ fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &
     let m = 1024;
     let k = 1024;
 
-    let tensor = Tensor::arange(0.0f32, (b * m * k) as f32, &device)
+    let tensor = Tensor::arange(0.0f32, (b * m * k) as f32, device)
         .unwrap()
         .to_dtype(dtype)
         .unwrap()

diff --git a/candle-core/benches/benchmarks/where_cond.rs b/candle-core/benches/benchmarks/where_cond.rs
@@ -25,9 +25,9 @@ const SIZE: usize = B * M * K;
 const DATA: [u8; SIZE] = create_cond_arr::<SIZE>();
 
 fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
-    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), &device).unwrap();
-    let on_true = Tensor::ones((B, M, K), dtype, &device).unwrap();
-    let on_false = Tensor::zeros((B, M, K), dtype, &device).unwrap();
+    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), device).unwrap();
+    let on_true = Tensor::ones((B, M, K), dtype, device).unwrap();
+    let on_false = Tensor::zeros((B, M, K), dtype, device).unwrap();
 
     let elements = B * M * K;
     // E.g. 2 f32 tensors + 1 u8 tensor

diff --git a/candle-core/examples/metal_basics.rs b/candle-core/examples/metal_basics.rs
@@ -0,0 +1,28 @@
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+use anyhow::Result;
+use candle_core::{Device, Tensor};
+
+fn main() -> Result<()> {
+    // This requires the code to be run with MTL_CAPTURE_ENABLED=1
+    let device = Device::new_metal(0)?;
+    let metal_device = match &device {
+        Device::Metal(m) => m,
+        _ => anyhow::bail!("unexpected device"),
+    };
+    metal_device.capture("/tmp/candle.gputrace")?;
+    // This first synchronize ensures that a new command buffer gets created after setting up the
+    // capture scope.
+    device.synchronize()?;
+    let x = Tensor::randn(0f32, 1.0, (128, 128), &device)?;
+    let x1 = x.add(&x)?;
+    println!("{x1:?}");
+    // This second synchronize ensures that the command buffer gets committed before the end of the
+    // capture scope.
+    device.synchronize()?;
+    Ok(())
+}
diff --git a/candle-core/src/backprop.rs b/candle-core/src/backprop.rs
@@ -320,13 +320,13 @@ impl Tensor {
                         dilation,
                         output_padding: _output_padding,
                     } => {
-                        let grad_arg = grad.conv2d(kernel, *padding, *dilation, *stride, 1)?;
+                        let grad_arg = grad.conv2d(kernel, *padding, *stride, *dilation, 1)?;
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.add(&grad_arg)?;
 
                         let grad_kernel = grad
                             .transpose(0, 1)?
-                            .conv2d(&arg.transpose(0, 1)?, *padding, *stride, *dilation, 1)?
+                            .conv2d(&arg.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                             .transpose(0, 1)?;
                         let sum_grad = grads.or_insert(kernel)?;
                         let (_, _, k0, k1) = kernel.dims4()?;
@@ -634,7 +634,8 @@ impl Tensor {
                         let zeros = arg.zeros_like()?;
                         let positive_mask = arg.gt(&zeros)?.to_dtype(arg.dtype())?;
                         let negative_mask = arg.le(&zeros)?.to_dtype(arg.dtype())?;
-                        let negative_exp_mask = ((negative_mask * arg.exp())? * *alpha)?;
+                        // node == alpha * (e^x - 1) for x <= 0, reuse it
+                        let negative_exp_mask = (negative_mask * (*node + *alpha))?;
                         let combined_mask = (positive_mask + negative_exp_mask)?;
                         *sum_grad = sum_grad.add(&(grad * combined_mask)?)?
                     }

diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs
@@ -47,6 +47,10 @@ impl std::ops::Deref for CudaDevice {
 }
 
 impl CudaDevice {
+    pub fn cublas_handle(&self) -> &cudarc::cublas::CudaBlas {
+        &*self.blas
+    }
+
     pub fn cuda_device(&self) -> Arc<cudarc::driver::CudaDevice> {
         self.device.clone()
     }

diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs
@@ -273,7 +273,13 @@ impl MetalDevice {
         let descriptor = metal::CaptureDescriptor::new();
         descriptor.set_destination(metal::MTLCaptureDestination::GpuTraceDocument);
         descriptor.set_capture_device(self);
-        descriptor.set_output_url(path);
+        // The [set_output_url] call requires an absolute path so we convert it if needed.
+        if path.as_ref().is_absolute() {
+            descriptor.set_output_url(path);
+        } else {
+            let path = std::env::current_dir()?.join(path);
+            descriptor.set_output_url(path);
+        }
 
         capture
             .start_capture(&descriptor)

diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs
@@ -651,9 +651,9 @@ impl Tensor {
     ///
     /// * `args` - A slice of 1D tensors.
     /// * `xy_indexing` - Whether to use xy indexing or ij indexing. If xy is selected, the
-    /// first dimension corresponds to the cardinality of the second input and the second
-    /// dimension corresponds to the cardinality of the first input. If ij is selected, the
-    /// dimensions are in the same order as the cardinality of the inputs.
+    ///   first dimension corresponds to the cardinality of the second input and the second
+    ///   dimension corresponds to the cardinality of the first input. If ij is selected, the
+    ///   dimensions are in the same order as the cardinality of the inputs.
     ///
     /// # Examples
     ///

diff --git a/candle-core/tests/conv_tests.rs b/candle-core/tests/conv_tests.rs
@@ -730,6 +730,103 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
             ]
         ]
     );
+
+    // Repeat the same test with the following parameters; t and w are unmodified.
+    let padding = 1;
+    let outpadding = 1;
+    let dilation = 1;
+    let stride = 2;
+
+    let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?;
+    let loss = res.sqr()?.sum_all()?;
+    assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 3627.0); // torch gives 3626.8560
+
+    let grads = loss.backward()?;
+
+    let grad_t = grads.get(&t).unwrap();
+    let grad_w = grads.get(&w).unwrap();
+    assert_eq!(grad_t.dims(), [1, 4, 7, 5]);
+    assert_eq!(grad_w.dims(), [4, 2, 3, 5]);
+
+    #[rustfmt::skip]
+    assert_eq!(
+        test_utils::to_vec3_round(&grad_t.i(0)?, 1)?,
+        [
+            [
+                [  13.2,  -40.7,   -9.7,  -47.3,  -82.7],
+                [ -98.2,    9.7,   57.7,   -6.2,  180.7],
+                [ 100.2,   24.1,    3.7, -100.5,  -48.1],
+                [  -0.3,   13.5,   -2.9,   80.0,  -49.8],
+                [  47.2,  -25.6,  -74.4,   61.2,  -18.4],
+                [   4.6,  -69.5,   27.9,   66.5,  -88.1],
+                 // 4th column on next row; torch is 4.2
+                [ -12.0,   79.2,  -40.0,    4.1,  -97.1],
+            ],
+            [
+                [ -42.2,  -36.5,  -51.1,    7.5,   32.3],
+                [  74.1,  -44.6,  -68.8,   19.5,    7.7],
+                [ 137.1,   54.2,  153.8,  -58.0,   45.5],
+                [  24.4,  -56.8,    9.7,  -41.0,  -14.5],
+                [  -3.7,   72.6,    8.3,  134.8,   40.5],
+                [  43.2,  -56.9,  -47.5,  -89.4,  -95.4],
+                [  68.2,  108.1,  -80.0,   57.0, -121.1]
+            ],
+            [
+                [  31.1,  -11.4,  -34.8,   33.1,  -44.2],
+                [  29.4,  -31.6,  -40.2,   13.7,   13.1],
+                [  -0.8,  -83.8,   -7.8,  -17.3,   78.2],
+                [  12.0, -118.7,  137.5,  -76.7,   50.8],
+                [ -28.7, -114.2,   -3.7,  -96.3,  -13.8],
+                [ -31.8,   28.5,  -14.3,    4.6,   13.4],
+                [  28.0,   -0.2,  -38.9,  -29.7,  -59.0]
+            ],
+            [
+                [ -16.8,   38.5,   15.5,   26.6,   48.9],
+                [  14.5,   49.6,  -24.8,   65.6,   61.7],
+                [  22.1,  -64.7,   -4.3,  -51.0,   36.3],
+                [  31.0,  -88.9,   47.1, -123.5,   -3.8],
+                [ -14.8,  -39.8,  128.2, -110.3,   42.6],
+                // 1st column on next row; torch is -7.2
+                [  -7.1,   95.3,  -21.3,  -58.7,  -13.9], 
+                [  26.9,   21.3,   16.1,   70.3,   32.1]
+            ]
+        ]
+    );
+
+    #[rustfmt::skip]
+    assert_eq!(
+        test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?,
+        [
+            // 2nd value; torch gets -3.2, 3rd value; torch gets 221.8
+           -2.460e+01, -3.100e+00,  2.219e+02,  7.400e+00,  5.620e+01,
+            7.420e+01,  7.830e+01,  8.900e+00,  1.050e+01,  2.810e+01,
+            5.100e+00, -1.046e+02, -1.572e+02,  8.710e+01, -9.840e+01,
+           -4.230e+01, -1.898e+02,  1.860e+01, -3.570e+01,  9.810e+01,
+            4.680e+01,  1.182e+02,  4.020e+01, -1.900e+00,  1.508e+02,
+            1.094e+02,  1.018e+02, -4.620e+01,  1.591e+02, -2.320e+01,
+            // 5th value; torch gets 7.1
+           -8.450e+01, -4.600e+00,  6.330e+01,  1.123e+02, -7.000e+00,
+            1.101e+02, -6.620e+01,  2.090e+01, -5.120e+01,  8.990e+01,
+            9.050e+01, -6.990e+01,  6.800e+01, -9.250e+01,  1.380e+02,
+            4.720e+01,  4.710e+01,  6.210e+01,  8.870e+01,  2.098e+02,
+            3.870e+01, -1.390e+01,  6.270e+01,  1.484e+02, -9.920e+01,
+           -4.200e+01, -1.505e+02, -1.480e+01, -2.620e+01,  8.220e+01,
+           -3.350e+01, -2.260e+01, -1.198e+02, -5.080e+01,  1.259e+02,
+            5.600e+01,  9.270e+01,  1.209e+02,  6.590e+01, -8.330e+01,
+            7.000e+00, -2.600e+01, -1.133e+02,  3.870e+01,  4.020e+01,
+           -6.300e+00, -8.710e+01, -5.150e+01, -8.510e+01,  2.000e-01,
+            3.640e+01, -6.100e+00,  6.590e+01, -2.700e+00,  6.550e+01,
+            // 4th value; torch gets 3.8
+            5.300e+00, -6.760e+01, -4.270e+01, -3.900e+00,  2.880e+01,
+            5.260e+01,  6.170e+01, -1.203e+02, -1.610e+01,  7.740e+01,
+           -1.008e+02, -1.070e+01, -9.900e+00,  3.300e+00, -2.620e+01,
+           -4.440e+01,  2.580e+01, -6.920e+01, -4.220e+01,  1.108e+02,
+            1.240e+01, -3.440e+01, -2.800e+00,  7.880e+01, -6.690e+01,
+            1.480e+01,  2.310e+01, -4.260e+01, -1.500e+00, -4.760e+01,
+            5.350e+01, -2.260e+01,  8.000e-01, -3.840e+01, -2.500e+00
+        ]
+    );
+
     Ok(())
 }
 

diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
@@ -35,7 +35,7 @@ serde = { workspace = true }
 serde_json = { workspace = true }
 symphonia = { version = "0.5.3", features = ["all"], optional = true }
 tokenizers = { workspace = true, features = ["onig"] }
-cpal= { version = "0.15.2", optional = true }
+cpal = { version = "0.15.2", optional = true }
 
 [dev-dependencies]
 anyhow = { workspace = true }

diff --git a/candle-examples/examples/beit/README.md b/candle-examples/examples/beit/README.md
@@ -0,0 +1,20 @@
+# candle-beit
+
+[Beit](https://arxiv.org/abs/2106.08254) is a computer vision model.
+In this example, it is used as an ImageNet classifier: the model returns the
+probability for the image to belong to each of the 1000 ImageNet categories.
+
+## Running an example
+
+```bash
+cargo run --example beit --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
+
+> mountain bike, all-terrain bike, off-roader: 56.16%
+> bicycle-built-for-two, tandem bicycle, tandem: 3.08%
+> maillot                 : 2.23%
+> alp                     : 0.88%
+> crash helmet            : 0.85%
+
+```
+
+![Leading group, Giro d'Italia 2021](../yolo-v8/assets/bike.jpg)