potentially incorrect transformation? #241

Open
avik-pal opened this issue Jan 16, 2025 · 3 comments

Comments

@avik-pal (Collaborator)

I will try to reduce this further, but I am opening an initial version for now:

module {
  func.func private @"*_broadcast_scalar"(%arg0: tensor<f32>) -> (tensor<f32>, tensor<f32>) {
    %0 = stablehlo.multiply %arg0, %arg0 : tensor<f32>
    return %0, %arg0 : tensor<f32>, tensor<f32>
  }
  func.func private @identity_broadcast_scalar(%arg0: tensor<f32>) -> tensor<f32> {
    return %arg0 : tensor<f32>
  }
  func.func private @"*_broadcast_scalar1"(%arg0: tensor<i64>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<i64>, tensor<f32>) {
    %0 = stablehlo.convert %arg0 : (tensor<i64>) -> tensor<f32>
    %1 = stablehlo.multiply %0, %arg1 : tensor<f32>
    return %1, %arg0, %arg1 : tensor<f32>, tensor<i64>, tensor<f32>
  }
  func.func private @"+_broadcast_scalar"(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<f32>, tensor<f32>) {
    %0 = stablehlo.add %arg0, %arg1 : tensor<f32>
    return %0, %arg0, %arg1 : tensor<f32>, tensor<f32>, tensor<f32>
  }
  func.func @main(%arg0: tensor<3x12x4xf32>) -> (tensor<3x12x12xf32>, tensor<3x12x4xf32>) {
    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<3x12x4xf32>) -> tensor<4x12x3xf32>
    %1:2 = enzyme.batch @"*_broadcast_scalar"(%0) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> (tensor<4x12x3xf32>, tensor<4x12x3xf32>)
    %2 = stablehlo.convert %1#0 : tensor<4x12x3xf32>
    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f32>
    %3 = enzyme.batch @identity_broadcast_scalar(%2) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> tensor<4x12x3xf32>
    %4 = stablehlo.convert %3 : tensor<4x12x3xf32>
    %5 = stablehlo.reduce(%4 init: %cst) applies stablehlo.add across dimensions = [0] : (tensor<4x12x3xf32>, tensor<f32>) -> tensor<12x3xf32>
    %6 = stablehlo.transpose %5, dims = [1, 0] : (tensor<12x3xf32>) -> tensor<3x12xf32>
    %7 = stablehlo.reshape %6 : (tensor<3x12xf32>) -> tensor<3x12x1xf32>
    %8 = stablehlo.transpose %7, dims = [2, 1, 0] : (tensor<3x12x1xf32>) -> tensor<1x12x3xf32>
    %9 = stablehlo.transpose %1#1, dims = [1, 0, 2] : (tensor<4x12x3xf32>) -> tensor<12x4x3xf32>
    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<12x12x3xf32>
    %10 = stablehlo.transpose %9, dims = [2, 0, 1] : (tensor<12x4x3xf32>) -> tensor<3x12x4xf32>
    %11 = stablehlo.transpose %1#1, dims = [2, 0, 1] : (tensor<4x12x3xf32>) -> tensor<3x4x12xf32>
    %12 = stablehlo.convert %10 : tensor<3x12x4xf32>
    %13 = stablehlo.convert %11 : tensor<3x4x12xf32>
    %14 = stablehlo.dot_general %12, %13, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x12x4xf32>, tensor<3x4x12xf32>) -> tensor<3x12x12xf32>
    %15 = stablehlo.transpose %14, dims = [1, 2, 0] : (tensor<3x12x12xf32>) -> tensor<12x12x3xf32>
    %c = stablehlo.constant dense<2> : tensor<12x12x3xi64>
    %16:3 = enzyme.batch @"*_broadcast_scalar1"(%c, %15) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xi64>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xi64>, tensor<12x12x3xf32>)
    %17 = stablehlo.convert %16#0 : tensor<12x12x3xf32>
    %18 = stablehlo.transpose %8, dims = [1, 0, 2] : (tensor<1x12x3xf32>) -> tensor<12x1x3xf32>
    %19 = stablehlo.broadcast_in_dim %8, dims = [0, 1, 2] : (tensor<1x12x3xf32>) -> tensor<12x12x3xf32>
    %20:3 = enzyme.batch @"+_broadcast_scalar"(%19, %17) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
    %21 = stablehlo.convert %20#0 : tensor<12x12x3xf32>
    %22 = stablehlo.transpose %21, dims = [2, 1, 0] : (tensor<12x12x3xf32>) -> tensor<3x12x12xf32>
    %23 = stablehlo.transpose %1#1, dims = [2, 1, 0] : (tensor<4x12x3xf32>) -> tensor<3x12x4xf32>
    return %22, %23 : tensor<3x12x12xf32>, tensor<3x12x4xf32>
  }
}
module {
  func.func private @"*_broadcast_scalar"(%arg0: tensor<f32>) -> (tensor<f32>, tensor<f32>) {
    %0 = stablehlo.multiply %arg0, %arg0 : tensor<f32>
    return %0, %arg0 : tensor<f32>, tensor<f32>
  }
  func.func private @identity_broadcast_scalar(%arg0: tensor<f32>) -> tensor<f32> {
    return %arg0 : tensor<f32>
  }
  func.func private @"*_broadcast_scalar1"(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<f32>, tensor<f32>) {
    %0 = stablehlo.multiply %arg0, %arg1 : tensor<f32>
    return %0, %arg0, %arg1 : tensor<f32>, tensor<f32>, tensor<f32>
  }
  func.func private @"+_broadcast_scalar"(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>, tensor<f32>, tensor<f32>) {
    %0 = stablehlo.add %arg0, %arg1 : tensor<f32>
    return %0, %arg0, %arg1 : tensor<f32>, tensor<f32>, tensor<f32>
  }
  func.func @main(%arg0: tensor<3x12x4xf32>) -> (tensor<3x12x12xf32>, tensor<3x12x4xf32>) {
    %cst = stablehlo.constant dense<2.000000e+00> : tensor<12x12x3xf32>
    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<3x12x4xf32>) -> tensor<4x12x3xf32>
    %1:2 = enzyme.batch @"*_broadcast_scalar"(%0) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> (tensor<4x12x3xf32>, tensor<4x12x3xf32>)
    %2 = enzyme.batch @identity_broadcast_scalar(%1#0) {batch_shape = array<i64: 4, 12, 3>} : (tensor<4x12x3xf32>) -> tensor<4x12x3xf32>
    %3 = stablehlo.reduce(%2 init: %cst_0) applies stablehlo.add across dimensions = [0] : (tensor<4x12x3xf32>, tensor<f32>) -> tensor<12x3xf32>
    %4 = stablehlo.transpose %3, dims = [1, 0] : (tensor<12x3xf32>) -> tensor<3x12xf32>
    %5 = stablehlo.reshape %4 : (tensor<3x12xf32>) -> tensor<3x12x1xf32>
    %6 = stablehlo.transpose %5, dims = [2, 1, 0] : (tensor<3x12x1xf32>) -> tensor<1x12x3xf32>
    %7 = stablehlo.dot_general %1#1, %1#1, batching_dims = [2] x [2], contracting_dims = [0] x [0] : (tensor<4x12x3xf32>, tensor<4x12x3xf32>) -> tensor<3x12x12xf32>
    %8 = stablehlo.transpose %7, dims = [1, 2, 0] : (tensor<3x12x12xf32>) -> tensor<12x12x3xf32>
    %9:3 = enzyme.batch @"*_broadcast_scalar1"(%cst, %8) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
    %10 = stablehlo.broadcast_in_dim %6, dims = [0, 1, 2] : (tensor<1x12x3xf32>) -> tensor<12x12x3xf32>
    %11:3 = enzyme.batch @"+_broadcast_scalar"(%10, %9#0) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
    %12 = stablehlo.transpose %11#0, dims = [2, 1, 0] : (tensor<12x12x3xf32>) -> tensor<3x12x12xf32>
    %13 = stablehlo.transpose %1#1, dims = [2, 1, 0] : (tensor<4x12x3xf32>) -> tensor<3x12x4xf32>
    return %12, %13 : tensor<3x12x12xf32>, tensor<3x12x4xf32>
  }
}

Running the batch pass fails with:

envs/crash.mlir:10:10: error: Mismatched dimension sizes 12 and 3 in dimension 0
    %1 = stablehlo.multiply %arg0, %arg1 : tensor<f32>
         ^
envs/crash.mlir:29:12: note: called from
    %9:3 = enzyme.batch @"*_broadcast_scalar1"(%c, %8) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
           ^
envs/crash.mlir:10:10: remark: location of op
    %1 = stablehlo.multiply %arg0, %arg1 : tensor<f32>
         ^
envs/crash.mlir:29:12: note: called from
    %9:3 = enzyme.batch @"*_broadcast_scalar1"(%c, %8) {batch_shape = array<i64: 12, 12, 3>} : (tensor<12x12x3xf32>, tensor<12x12x3xf32>) -> (tensor<12x12x3xf32>, tensor<12x12x3xf32>, tensor<12x12x3xf32>)
           ^
LLVM ERROR: Failed to infer result type(s):
"stablehlo.multiply"(...) {} : (tensor<12x3x12xf32>, tensor<3x12x12xf32>) -> ( ??? )
@avik-pal (Collaborator, Author)

module {
  func.func @main(%arg0: tensor<2x12x4xf32>) -> (tensor<2x12x12xf32>, tensor<2x12x4xf32>) {
    %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x12x4xf32>) -> tensor<4x12x2xf32>
    %cst = stablehlo.constant dense<2.000000e+00> : tensor<12x12x2xf32>
    %1 = stablehlo.transpose %0, dims = [1, 0, 2] : (tensor<4x12x2xf32>) -> tensor<12x4x2xf32>
    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<12x12x2xf32>
    %2 = stablehlo.transpose %1, dims = [2, 0, 1] : (tensor<12x4x2xf32>) -> tensor<2x12x4xf32>
    %3 = stablehlo.transpose %0, dims = [2, 0, 1] : (tensor<4x12x2xf32>) -> tensor<2x4x12xf32>
    %4 = stablehlo.convert %2 : tensor<2x12x4xf32>
    %5 = stablehlo.convert %3 : tensor<2x4x12xf32>
    %6 = stablehlo.dot_general %4, %5, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x12x4xf32>, tensor<2x4x12xf32>) -> tensor<2x12x12xf32>
    %7 = stablehlo.transpose %6, dims = [1, 2, 0] : (tensor<2x12x12xf32>) -> tensor<12x12x2xf32>
    %8 = stablehlo.multiply %cst, %7 : tensor<12x12x2xf32>
    %9 = stablehlo.transpose %8, dims = [2, 1, 0] : (tensor<12x12x2xf32>) -> tensor<2x12x12xf32>
    %10 = stablehlo.transpose %0, dims = [2, 1, 0] : (tensor<4x12x2xf32>) -> tensor<2x12x4xf32>
    return %9, %10 : tensor<2x12x12xf32>, tensor<2x12x4xf32>
  }
}

This is enough to cause the crash; note that there is no enzyme.batch call left in it.
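
For reference, my reading of what the reduced module computes (a NumPy sketch, assuming the chain of transposes cancels as written; this is not part of the report):

import numpy as np

def reference(a):
    # a: (2, 12, 4). The transposes reduce to a batched A @ A^T,
    # scaled by 2; the second result is the input unchanged.
    out = 2.0 * np.einsum("bik,bjk->bij", a, a)  # (2, 12, 12)
    return out, a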

@wsmoses (Member) commented Jan 17, 2025

I just updated per @jumerckx's PR for batching. Does that resolve it? ...actually no, I suppose not, since the latter case doesn't have a call.

I think the issue here is that dot_general needs a custom batch interface implementation, like transpose has.
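
If that is right, batching a dot_general would presumably mean prepending the new batch dimensions as extra batching_dims (shifting the existing dimension indices accordingly), much as a batched transpose prepends identity entries to its permutation. In einsum terms (illustrative only, not the actual interface):

import numpy as np

lhs = np.ones((5, 12, 4), dtype=np.float32)  # 5 = a prepended batch dim
rhs = np.ones((5, 4, 12), dtype=np.float32)

# unbatched contraction:         einsum("ik,kj->ij")
# with the prepended batch dim:  it becomes an additional batching dim
out = np.einsum("bik,bkj->bij", lhs, rhs)    # (5, 12, 12)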

@avik-pal (Collaborator, Author)

This failure seems to be very specific to a dot_general followed by a multiply. If I replace the 2 * (x * x) with y = x * x; y + y (i.e. replace the outer multiply with an add), it no longer crashes.
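
Schematically, at the user level (a hypothetical restatement in NumPy; x @ x.T stands in for the dot_general in the reproducer):

import numpy as np

x = np.random.rand(12, 4).astype(np.float32)

# Crashes under the batch pass (per the report), written there as 2 * (x * x):
bad = 2 * (x @ x.T)

# No crash: same value, with the outer multiply replaced by an add
# ("y = x * x; y + y" in the report's notation):
y = x @ x.T
ok = y + y

assert np.allclose(bad, ok)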
