potentially incorrect transformation? #241

Open
avik-pal opened this issue Jan 16, 2025 · 3 comments

Comments

@avik-pal (Collaborator)

I will try to reduce this further, but I am opening an initial version for now:

module {
  func.func private @"*_broadcast_scalar"(%arg0: tensor<f32>) -> (tensor<f32>, tensor<f32>) {
    %0 = stablehlo.multiply %arg0, %arg0 : tensor<f32>
    return %0, %arg0 : tensor<f32>, tensor<f32>
  }
  func.func private @identity_broadcast_scalar(%arg0: tensor<f32>) -> tensor<f32> {
    return %arg0 : tensor<f32>
  }
  func.func private @"*_broadcast_scalar1"(%arg0: tensor<i64>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<i64>, tensor<f32>) {
    %0 = stablehlo.convert %arg0 : (tensor<i64>) -> tensor<f32>
    %1 = stablehlo.multiply %0, %arg1 : tensor<f32>
    return %1, %arg0, %arg1 : tensor<f32>, tensor<i64>, tensor<f32>
  }
  func.func private @"+_broadcast_scalar"(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<f32>, tensor<f32>) {
    %0 = stablehlo.add %arg0, %arg1 : tensor<f32>
    return %0, %arg0, %arg1 : tensor<f32>, tensor<f32>, tensor<f32>
  }
  func.func @main(%arg0: tensor<3x12x4xf32>) -> (tensor<3x12x12xf32>, tensor<3x12x4xf32>) {
    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<3x12x4xf32>) -> tensor<4x12x3xf32>
    %1:2 = enzyme.batch @"*_broadcast_scalar"(%0) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> (tensor<4x12x3xf32>, tensor<4x12x3xf32>)
    %2 = stablehlo.convert %1#0 : tensor<4x12x3xf32>
    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f32>
    %3 = enzyme.batch @identity_broadcast_scalar(%2) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> tensor<4x12x3xf32>
    %4 = stablehlo.convert %3 : tensor<4x12x3xf32>
    %5 = stablehlo.reduce(%4 init: %cst) applies stablehlo.add across dimensions = [0] : (tensor<4x12x3xf32>, tensor<f32>) -> tensor<12x3xf32>
    %6 = stablehlo.transpose %5, dims = [1, 0] : (tensor<12x3xf32>) -> tensor<3x12xf32>
    %7 = stablehlo.reshape %6 : (tensor<3x12xf32>) -> tensor<3x12x1xf32>
    %8 = stablehlo.transpose %7, dims = [2, 1, 0] : (tensor<3x12x1xf32>) -> tensor<1x12x3xf32>
    %9 = stablehlo.transpose %1#1, dims = [1, 0, 2] : (tensor<4x12x3xf32>) -> tensor<12x4x3xf32>
    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<12x12x3xf32>
    %10 = stablehlo.transpose %9, dims = [2, 0, 1] : (tensor<12x4x3xf32>) -> tensor<3x12x4xf32>
    %11 = stablehlo.transpose %1#1, dims = [2, 0, 1] : (tensor<4x12x3xf32>) -> tensor<3x4x12xf32>
    %12 = stablehlo.convert %10 : tensor<3x12x4xf32>
    %13 = stablehlo.convert %11 : tensor<3x4x12xf32>
    %14 = stablehlo.dot_general %12, %13, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x12x4xf32>, tensor<3x4x12xf32>) -> tensor<3x12x12xf32>
    %15 = stablehlo.transpose %14, dims = [1, 2, 0] : (tensor<3x12x12xf32>) -> tensor<12x12x3xf32>
    %c = stablehlo.constant dense<2> : tensor<12x12x3xi64>
    %16:3 = enzyme.batch @"*_broadcast_scalar1"(%c, %15) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xi64>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xi64>, tensor<12x12x3xf32>)
    %17 = stablehlo.convert %16#0 : tensor<12x12x3xf32>
    %18 = stablehlo.transpose %8, dims = [1, 0, 2] : (tensor<1x12x3xf32>) -> tensor<12x1x3xf32>
    %19 = stablehlo.broadcast_in_dim %8, dims = [0, 1, 2] : (tensor<1x12x3xf32>) -> tensor<12x12x3xf32>
    %20:3 = enzyme.batch @"+_broadcast_scalar"(%19, %17) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
    %21 = stablehlo.convert %20#0 : tensor<12x12x3xf32>
    %22 = stablehlo.transpose %21, dims = [2, 1, 0] : (tensor<12x12x3xf32>) -> tensor<3x12x12xf32>
    %23 = stablehlo.transpose %1#1, dims = [2, 1, 0] : (tensor<4x12x3xf32>) -> tensor<3x12x4xf32>
    return %22, %23 : tensor<3x12x12xf32>, tensor<3x12x4xf32>
  }
}
module {
  func.func private @"*_broadcast_scalar"(%arg0: tensor<f32>) -> (tensor<f32>, tensor<f32>) {
    %0 = stablehlo.multiply %arg0, %arg0 : tensor<f32>
    return %0, %arg0 : tensor<f32>, tensor<f32>
  }
  func.func private @identity_broadcast_scalar(%arg0: tensor<f32>) -> tensor<f32> {
    return %arg0 : tensor<f32>
  }
  func.func private @"*_broadcast_scalar1"(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<f32>, tensor<f32>) {
    %0 = stablehlo.multiply %arg0, %arg1 : tensor<f32>
    return %0, %arg0, %arg1 : tensor<f32>, tensor<f32>, tensor<f32>
  }
  func.func private @"+_broadcast_scalar"(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<f32>, tensor<f32>) {
    %0 = stablehlo.add %arg0, %arg1 : tensor<f32>
    return %0, %arg0, %arg1 : tensor<f32>, tensor<f32>, tensor<f32>
  }
  func.func @main(%arg0: tensor<3x12x4xf32>) -> (tensor<3x12x12xf32>, tensor<3x12x4xf32>) {
    %cst = stablehlo.constant dense<2.000000e+00> : tensor<12x12x3xf32>
    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<3x12x4xf32>) -> tensor<4x12x3xf32>
    %1:2 = enzyme.batch @"*_broadcast_scalar"(%0) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> (tensor<4x12x3xf32>, tensor<4x12x3xf32>)
    %2 = enzyme.batch @identity_broadcast_scalar(%1#0) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> tensor<4x12x3xf32>
    %3 = stablehlo.reduce(%2 init: %cst_0) applies stablehlo.add across dimensions = [0] : (tensor<4x12x3xf32>, tensor<f32>) -> tensor<12x3xf32>
    %4 = stablehlo.transpose %3, dims = [1, 0] : (tensor<12x3xf32>) -> tensor<3x12xf32>
    %5 = stablehlo.reshape %4 : (tensor<3x12xf32>) -> tensor<3x12x1xf32>
    %6 = stablehlo.transpose %5, dims = [2, 1, 0] : (tensor<3x12x1xf32>) -> tensor<1x12x3xf32>
    %7 = stablehlo.dot_general %1#1, %1#1, batching_dims = [2] x [2], contracting_dims = [0] x [0] : (tensor<4x12x3xf32>, tensor<4x12x3xf32>) -> tensor<3x12x12xf32>
    %8 = stablehlo.transpose %7, dims = [1, 2, 0] : (tensor<3x12x12xf32>) -> tensor<12x12x3xf32>
    %9:3 = enzyme.batch @"*_broadcast_scalar1"(%cst, %8) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
    %10 = stablehlo.broadcast_in_dim %6, dims = [0, 1, 2] : (tensor<1x12x3xf32>) -> tensor<12x12x3xf32>
    %11:3 = enzyme.batch @"+_broadcast_scalar"(%10, %9#0) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
    %12 = stablehlo.transpose %11#0, dims = [2, 1, 0] : (tensor<12x12x3xf32>) -> tensor<3x12x12xf32>
    %13 = stablehlo.transpose %1#1, dims = [2, 1, 0] : (tensor<4x12x3xf32>) -> tensor<3x12x4xf32>
    return %12, %13 : tensor<3x12x12xf32>, tensor<3x12x4xf32>
  }
}

Running the batch pass fails with:

envs/crash.mlir:10:10: error: Mismatched dimension sizes 12 and 3 in dimension 0
    %1 = stablehlo.multiply %arg0, %arg1 : tensor<f32>
         ^
envs/crash.mlir:29:12: note: called from
    %9:3 = enzyme.batch @"*_broadcast_scalar1"(%c, %8) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
           ^
envs/crash.mlir:10:10: remark: location of op
    %1 = stablehlo.multiply %arg0, %arg1 : tensor<f32>
         ^
envs/crash.mlir:29:12: note: called from
    %9:3 = enzyme.batch @"*_broadcast_scalar1"(%c, %8) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
           ^
LLVM ERROR: Failed to infer result type(s):
"stablehlo.multiply"(...) {} : (tensor<12x3x12xf32>, tensor<3x12x12xf32>) -> ( ??? )
@avik-pal (Collaborator, Author)

module {
  func.func @main(%arg0: tensor<2x12x4xf32>) -> (tensor<2x12x12xf32>, tensor<2x12x4xf32>) {
    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x12x4xf32>) -> tensor<4x12x2xf32>
    %cst = stablehlo.constant dense<2.000000e+00> : tensor<12x12x2xf32>
    %1 = stablehlo.transpose %0, dims = [1, 0, 2] : (tensor<4x12x2xf32>) -> tensor<12x4x2xf32>
    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<12x12x2xf32>
    %2 = stablehlo.transpose %1, dims = [2, 0, 1] : (tensor<12x4x2xf32>) -> tensor<2x12x4xf32>
    %3 = stablehlo.transpose %0, dims = [2, 0, 1] : (tensor<4x12x2xf32>) -> tensor<2x4x12xf32>
    %4 = stablehlo.convert %2 : tensor<2x12x4xf32>
    %5 = stablehlo.convert %3 : tensor<2x4x12xf32>
    %6 = stablehlo.dot_general %4, %5, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x12x4xf32>, tensor<2x4x12xf32>) -> tensor<2x12x12xf32>
    %7 = stablehlo.transpose %6, dims = [1, 2, 0] : (tensor<2x12x12xf32>) -> tensor<12x12x2xf32>
    %8 = stablehlo.multiply %cst, %7 : tensor<12x12x2xf32>
    %9 = stablehlo.transpose %8, dims = [2, 1, 0] : (tensor<12x12x2xf32>) -> tensor<2x12x12xf32>
    %10 = stablehlo.transpose %0, dims = [2, 1, 0] : (tensor<4x12x2xf32>) -> tensor<2x12x4xf32>
    return %9, %10 : tensor<2x12x12xf32>, tensor<2x12x4xf32>
  }
}

This is enough to cause the crash; note that there is no enzyme.batch call left in it.
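
For reference, my reading of what the reduced module computes (a NumPy sketch, assuming the chain of transposes cancels as written; this is not part of the report):

import numpy as np

def reference(a):
    # a: (2, 12, 4). The transposes reduce to a batched A @ A^T,
    # scaled by 2; the second result is the input unchanged.
    out = 2.0 * np.einsum("bik,bjk->bij", a, a)  # (2, 12, 12)
    return out, a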

@wsmoses (Member) commented Jan 17, 2025

I just updated per @jumerckx's PR for batching. Does that resolve it? ...actually no, I suppose not, since the latter case doesn't have a call.

I think the issue here is that dot_general needs a custom batch interface implementation, like transpose has.
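
If that is right, batching a dot_general would presumably mean prepending the new batch dimensions as extra batching_dims (shifting the existing dimension indices accordingly), much as a batched transpose prepends identity entries to its permutation. In einsum terms (illustrative only, not the actual interface):

import numpy as np

lhs = np.ones((5, 12, 4), dtype=np.float32)  # 5 = a prepended batch dim
rhs = np.ones((5, 4, 12), dtype=np.float32)

# unbatched contraction:         einsum("ik,kj->ij")
# with the prepended batch dim:  it becomes an additional batching dim
out = np.einsum("bik,bkj->bij", lhs, rhs)    # (5, 12, 12)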

@avik-pal (Collaborator, Author)

This failure seems to be very specific to a dot_general followed by a multiply. If I replace the 2 * (x * x) with y = x * x; y + y (i.e. replace the outer multiply with an add), it no longer crashes.
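
Schematically, at the user level (a hypothetical restatement in NumPy; x @ x.T stands in for the dot_general in the reproducer):

import numpy as np

x = np.random.rand(12, 4).astype(np.float32)

# Crashes under the batch pass (per the report), written there as 2 * (x * x):
bad = 2 * (x @ x.T)

# No crash: same value, with the outer multiply replaced by an add
# ("y = x * x; y + y" in the report's notation):
y = x @ x.T
ok = y + y

assert np.allclose(bad, ok)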
