Comments (2)
MLIR Difference
Reproducible
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 4], order = [0, 1]}>
#blocked6 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
#blocked7 = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#loc = loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0)
#loc1 = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @kernel(%arg0: !tt.ptr<f16> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg1: !tt.ptr<f16> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg2: !tt.ptr<f16> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg3: !tt.ptr<f32> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg4: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg5: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg6: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg7: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg8: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg9: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0)) attributes {noinline = false} {
%c32768_i32 = arith.constant 32768 : i32 loc(#loc1)
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
%c3_i32 = arith.constant 3 : i32 loc(#loc1)
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
%cst = arith.constant dense<128> : tensor<256x1xi32, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0.000000e+00> : tensor<256x128xf32, #blocked1> loc(#loc1)
%cst_1 = arith.constant dense<64> : tensor<64x128xi32, #blocked1> loc(#loc1)
%cst_2 = arith.constant dense<64> : tensor<256x64xi32, #blocked2> loc(#loc1)
%cst_3 = arith.constant dense<128> : tensor<128xi32, #blocked3> loc(#loc1)
%cst_4 = arith.constant dense<256> : tensor<256xi32, #blocked3> loc(#loc1)
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%0 = tt.get_program_id x : i32 loc(#loc2)
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked3> loc(#loc3)
%2 = arith.addi %1, %cst_4 : tensor<256xi32, #blocked3> loc(#loc4)
%3 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked3> loc(#loc5)
%4 = arith.addi %3, %cst_3 : tensor<128xi32, #blocked3> loc(#loc6)
%5 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked3> loc(#loc7)
%6 = triton_gpu.convert_layout %2 : tensor<256xi32, #blocked3> -> tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc8)
%7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1xi32, #blocked4> loc(#loc8)
%8 = triton_gpu.convert_layout %7 : tensor<256x1xi32, #blocked4> -> tensor<256x1xi32, #blocked> loc(#loc9)
%9 = tt.splat %arg7 : i32 -> tensor<256x1xi32, #blocked> loc(#loc9)
%10 = arith.muli %8, %9 : tensor<256x1xi32, #blocked> loc(#loc9)
%11 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<256x1x!tt.ptr<f16>, #blocked> loc(#loc10)
%12 = tt.addptr %11, %10 : tensor<256x1x!tt.ptr<f16>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc10)
%13 = triton_gpu.convert_layout %5 : tensor<64xi32, #blocked3> -> tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc11)
%14 = tt.expand_dims %13 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x64xi32, #blocked5> loc(#loc11)
%15 = triton_gpu.convert_layout %14 : tensor<1x64xi32, #blocked5> -> tensor<1x64xi32, #blocked2> loc(#loc12)
%16 = tt.broadcast %12 : tensor<256x1x!tt.ptr<f16>, #blocked> -> tensor<256x64x!tt.ptr<f16>, #blocked> loc(#loc12)
%17 = triton_gpu.convert_layout %16 : tensor<256x64x!tt.ptr<f16>, #blocked> -> tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc12)
%18 = tt.broadcast %15 : tensor<1x64xi32, #blocked2> -> tensor<256x64xi32, #blocked2> loc(#loc12)
%19 = tt.addptr %17, %18 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc12)
%20 = tt.addptr %19, %cst_2 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc13)
%21 = triton_gpu.convert_layout %5 : tensor<64xi32, #blocked3> -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc14)
%22 = tt.expand_dims %21 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1xi32, #blocked4> loc(#loc14)
%23 = triton_gpu.convert_layout %22 : tensor<64x1xi32, #blocked4> -> tensor<64x1xi32, #blocked> loc(#loc15)
%24 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>, #blocked> loc(#loc15)
%25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc15)
%26 = triton_gpu.convert_layout %4 : tensor<128xi32, #blocked3> -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc16)
%27 = tt.expand_dims %26 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x128xi32, #blocked5> loc(#loc16)
%28 = triton_gpu.convert_layout %27 : tensor<1x128xi32, #blocked5> -> tensor<1x128xi32, #blocked1> loc(#loc17)
%29 = tt.splat %arg8 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc17)
%30 = arith.muli %28, %29 : tensor<1x128xi32, #blocked1> loc(#loc17)
%31 = tt.broadcast %25 : tensor<64x1x!tt.ptr<f16>, #blocked> -> tensor<64x128x!tt.ptr<f16>, #blocked> loc(#loc18)
%32 = triton_gpu.convert_layout %31 : tensor<64x128x!tt.ptr<f16>, #blocked> -> tensor<64x128x!tt.ptr<f16>, #blocked1> loc(#loc18)
%33 = tt.broadcast %30 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> loc(#loc18)
%34 = tt.addptr %32, %33 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc18)
%35 = tt.addptr %34, %cst_1 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc19)
%36 = arith.addi %0, %c1_i32 : i32 loc(#loc20)
%37 = triton_gpu.convert_layout %1 : tensor<256xi32, #blocked3> -> tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc21)
%38 = tt.expand_dims %37 {axis = 1 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1xi32, #blocked4> loc(#loc21)
%39 = triton_gpu.convert_layout %38 : tensor<256x1xi32, #blocked4> -> tensor<256x1xi32, #blocked> loc(#loc22)
%40 = arith.muli %39, %cst : tensor<256x1xi32, #blocked> loc(#loc22)
%41 = triton_gpu.convert_layout %3 : tensor<128xi32, #blocked3> -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc23)
%42 = tt.expand_dims %41 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x128xi32, #blocked5> loc(#loc23)
%43 = triton_gpu.convert_layout %42 : tensor<1x128xi32, #blocked5> -> tensor<1x128xi32, #blocked1> loc(#loc24)
%44 = tt.broadcast %43 : tensor<1x128xi32, #blocked1> -> tensor<256x128xi32, #blocked1> loc(#loc24)
%45 = tt.splat %arg9 : i32 -> tensor<256x1xi32, #blocked> loc(#loc25)
%46 = arith.muli %8, %45 : tensor<256x1xi32, #blocked> loc(#loc25)
%47 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<256x1x!tt.ptr<f16>, #blocked> loc(#loc26)
%48 = tt.addptr %47, %46 : tensor<256x1x!tt.ptr<f16>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc26)
%49 = tt.broadcast %48 : tensor<256x1x!tt.ptr<f16>, #blocked> -> tensor<256x128x!tt.ptr<f16>, #blocked> loc(#loc27)
%50 = triton_gpu.convert_layout %49 : tensor<256x128x!tt.ptr<f16>, #blocked> -> tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc27)
%51 = tt.broadcast %28 : tensor<1x128xi32, #blocked1> -> tensor<256x128xi32, #blocked1> loc(#loc27)
%52 = tt.addptr %50, %51 : tensor<256x128x!tt.ptr<f16>, #blocked1>, tensor<256x128xi32, #blocked1> loc(#loc27)
%53 = tt.splat %arg4 : i32 -> tensor<256xi32, #blocked3> loc(#loc28)
%54 = arith.cmpi slt, %2, %53 : tensor<256xi32, #blocked3> loc(#loc28)
%55 = triton_gpu.convert_layout %54 : tensor<256xi1, #blocked3> -> tensor<256xi1, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc29)
%56 = tt.expand_dims %55 {axis = 1 : i32} : tensor<256xi1, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1xi1, #blocked4> loc(#loc29)
%57 = triton_gpu.convert_layout %56 : tensor<256x1xi1, #blocked4> -> tensor<256x1xi1, #blocked> loc(#loc30)
%58 = tt.splat %arg5 : i32 -> tensor<128xi32, #blocked3> loc(#loc31)
%59 = arith.cmpi slt, %4, %58 : tensor<128xi32, #blocked3> loc(#loc31)
%60 = triton_gpu.convert_layout %59 : tensor<128xi1, #blocked3> -> tensor<128xi1, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc32)
%61 = tt.expand_dims %60 {axis = 0 : i32} : tensor<128xi1, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x128xi1, #blocked5> loc(#loc32)
%62 = triton_gpu.convert_layout %61 : tensor<1x128xi1, #blocked5> -> tensor<1x128xi1, #blocked1> loc(#loc30)
%63 = tt.broadcast %57 : tensor<256x1xi1, #blocked> -> tensor<256x128xi1, #blocked> loc(#loc30)
%64 = triton_gpu.convert_layout %63 : tensor<256x128xi1, #blocked> -> tensor<256x128xi1, #blocked1> loc(#loc30)
%65 = tt.broadcast %62 : tensor<1x128xi1, #blocked1> -> tensor<256x128xi1, #blocked1> loc(#loc30)
%66 = arith.andi %64, %65 : tensor<256x128xi1, #blocked1> loc(#loc30)
scf.for %arg10 = %c0_i32 to %c3_i32 step %c1_i32 : i32 {
%67:3 = scf.for %arg11 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg12 = %cst_0, %arg13 = %20, %arg14 = %35) -> (tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>) : i32 {
%73 = triton_gpu.convert_layout %arg13 : tensor<256x64x!tt.ptr<f16>, #blocked2> -> tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc35)
%74 = tt.load %73 : tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc35)
%75 = triton_gpu.convert_layout %arg14 : tensor<64x128x!tt.ptr<f16>, #blocked1> -> tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc36)
%76 = tt.load %75 : tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc36)
%77 = triton_gpu.convert_layout %76 : tensor<64x128xf16, #blocked6> -> tensor<64x128xf16, #blocked1> loc(#loc36)
%78 = triton_gpu.convert_layout %74 : tensor<256x64xf16, #blocked2> -> tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> loc(#loc35)
%79 = triton_gpu.convert_layout %77 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> loc(#loc36)
%80 = triton_gpu.convert_layout %arg12 : tensor<256x128xf32, #blocked1> -> tensor<256x128xf32, #blocked7> loc(#loc37)
%81 = tt.dot %78, %79, %80 : tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> -> tensor<256x128xf32, #blocked7> loc(#loc38)
%82 = triton_gpu.convert_layout %81 : tensor<256x128xf32, #blocked7> -> tensor<256x128xf32, #blocked1> loc(#loc39)
%83 = tt.addptr %arg13, %cst_2 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc40)
%84 = tt.addptr %arg14, %cst_1 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41)
scf.yield %82, %83, %84 : tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1> loc(#loc39)
} loc(#loc34)
%68:2 = scf.while (%arg11 = %67#0, %arg12 = %36) : (tensor<256x128xf32, #blocked1>, i32) -> (tensor<256x128xf32, #blocked1>, i32) {
%73 = arith.cmpi slt, %arg12, %arg6 : i32 loc(#loc43)
scf.condition(%73) %arg11, %arg12 : tensor<256x128xf32, #blocked1>, i32 loc(#loc43)
} do {
^bb0(%arg11: tensor<256x128xf32, #blocked1> loc(unknown), %arg12: i32 loc(unknown)):
%73 = arith.muli %arg12, %c32768_i32 : i32 loc(#loc44)
%74 = tt.addptr %arg3, %73 : !tt.ptr<f32>, i32 loc(#loc45)
%75 = tt.splat %74 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>, #blocked> loc(#loc46)
%76 = tt.addptr %75, %40 : tensor<256x1x!tt.ptr<f32>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc46)
%77 = tt.broadcast %76 : tensor<256x1x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked> loc(#loc24)
%78 = triton_gpu.convert_layout %77 : tensor<256x128x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc24)
%79 = tt.addptr %78, %44 : tensor<256x128x!tt.ptr<f32>, #blocked1>, tensor<256x128xi32, #blocked1> loc(#loc24)
%80 = triton_gpu.convert_layout %79 : tensor<256x128x!tt.ptr<f32>, #blocked1> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
%81 = tt.load %80 : tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
%82 = arith.addf %arg11, %81 : tensor<256x128xf32, #blocked1> loc(#loc48)
%83 = arith.addi %arg12, %c1_i32 : i32 loc(#loc49)
scf.yield %82, %83 : tensor<256x128xf32, #blocked1>, i32 loc(#loc50)
} loc(#loc42)
%69 = arith.truncf %68#0 : tensor<256x128xf32, #blocked1> to tensor<256x128xf16, #blocked1> loc(#loc51)
%70 = triton_gpu.convert_layout %52 : tensor<256x128x!tt.ptr<f16>, #blocked1> -> tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
%71 = triton_gpu.convert_layout %69 : tensor<256x128xf16, #blocked1> -> tensor<256x128xf16, #blocked1> loc(#loc51)
%72 = triton_gpu.convert_layout %66 : tensor<256x128xi1, #blocked1> -> tensor<256x128xi1, #blocked1> loc(#loc51)
tt.store %70, %71, %72 : tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
} loc(#loc33)
tt.return loc(#loc52)
} loc(#loc)
} loc(#loc)
Fix 1
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 4], order = [0, 1]}>
#blocked6 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
#blocked7 = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#loc = loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0)
#loc1 = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @kernel(%arg0: !tt.ptr<f16> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg1: !tt.ptr<f16> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg2: !tt.ptr<f16> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg3: !tt.ptr<f32> loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg4: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg5: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg6: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg7: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg8: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0), %arg9: i32 loc("/home/ravil/work/triton/triton/.vscode/streamk/./reproducible.py":8:0)) attributes {noinline = false} {
%c32768_i32 = arith.constant 32768 : i32 loc(#loc1)
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
%c3_i32 = arith.constant 3 : i32 loc(#loc1)
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
%cst = arith.constant dense<128> : tensor<256x1xi32, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0.000000e+00> : tensor<256x128xf32, #blocked1> loc(#loc1)
%cst_1 = arith.constant dense<64> : tensor<64x128xi32, #blocked1> loc(#loc1)
%cst_2 = arith.constant dense<64> : tensor<256x64xi32, #blocked2> loc(#loc1)
%cst_3 = arith.constant dense<128> : tensor<128xi32, #blocked3> loc(#loc1)
%cst_4 = arith.constant dense<256> : tensor<256xi32, #blocked3> loc(#loc1)
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%0 = tt.get_program_id x : i32 loc(#loc2)
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked3> loc(#loc3)
%2 = arith.addi %1, %cst_4 : tensor<256xi32, #blocked3> loc(#loc4)
%3 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked3> loc(#loc5)
%4 = arith.addi %3, %cst_3 : tensor<128xi32, #blocked3> loc(#loc6)
%5 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked3> loc(#loc7)
%6 = triton_gpu.convert_layout %2 : tensor<256xi32, #blocked3> -> tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc8)
%7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1xi32, #blocked4> loc(#loc8)
%8 = triton_gpu.convert_layout %7 : tensor<256x1xi32, #blocked4> -> tensor<256x1xi32, #blocked> loc(#loc9)
%9 = tt.splat %arg7 : i32 -> tensor<256x1xi32, #blocked> loc(#loc9)
%10 = arith.muli %8, %9 : tensor<256x1xi32, #blocked> loc(#loc9)
%11 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<256x1x!tt.ptr<f16>, #blocked> loc(#loc10)
%12 = tt.addptr %11, %10 : tensor<256x1x!tt.ptr<f16>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc10)
%13 = triton_gpu.convert_layout %5 : tensor<64xi32, #blocked3> -> tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc11)
%14 = tt.expand_dims %13 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x64xi32, #blocked5> loc(#loc11)
%15 = triton_gpu.convert_layout %14 : tensor<1x64xi32, #blocked5> -> tensor<1x64xi32, #blocked2> loc(#loc12)
%16 = tt.broadcast %12 : tensor<256x1x!tt.ptr<f16>, #blocked> -> tensor<256x64x!tt.ptr<f16>, #blocked> loc(#loc12)
%17 = triton_gpu.convert_layout %16 : tensor<256x64x!tt.ptr<f16>, #blocked> -> tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc12)
%18 = tt.broadcast %15 : tensor<1x64xi32, #blocked2> -> tensor<256x64xi32, #blocked2> loc(#loc12)
%19 = tt.addptr %17, %18 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc12)
%20 = tt.addptr %19, %cst_2 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc13)
%21 = triton_gpu.convert_layout %5 : tensor<64xi32, #blocked3> -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc14)
%22 = tt.expand_dims %21 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1xi32, #blocked4> loc(#loc14)
%23 = triton_gpu.convert_layout %22 : tensor<64x1xi32, #blocked4> -> tensor<64x1xi32, #blocked> loc(#loc15)
%24 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>, #blocked> loc(#loc15)
%25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc15)
%26 = triton_gpu.convert_layout %4 : tensor<128xi32, #blocked3> -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc16)
%27 = tt.expand_dims %26 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x128xi32, #blocked5> loc(#loc16)
%28 = triton_gpu.convert_layout %27 : tensor<1x128xi32, #blocked5> -> tensor<1x128xi32, #blocked1> loc(#loc17)
%29 = tt.splat %arg8 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc17)
%30 = arith.muli %28, %29 : tensor<1x128xi32, #blocked1> loc(#loc17)
%31 = tt.broadcast %25 : tensor<64x1x!tt.ptr<f16>, #blocked> -> tensor<64x128x!tt.ptr<f16>, #blocked> loc(#loc18)
%32 = triton_gpu.convert_layout %31 : tensor<64x128x!tt.ptr<f16>, #blocked> -> tensor<64x128x!tt.ptr<f16>, #blocked1> loc(#loc18)
%33 = tt.broadcast %30 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1> loc(#loc18)
%34 = tt.addptr %32, %33 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc18)
%35 = tt.addptr %34, %cst_1 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc19)
%36 = arith.addi %0, %c1_i32 : i32 loc(#loc20)
%37 = triton_gpu.convert_layout %1 : tensor<256xi32, #blocked3> -> tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc21)
%38 = tt.expand_dims %37 {axis = 1 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1xi32, #blocked4> loc(#loc21)
%39 = triton_gpu.convert_layout %38 : tensor<256x1xi32, #blocked4> -> tensor<256x1xi32, #blocked> loc(#loc22)
%40 = arith.muli %39, %cst : tensor<256x1xi32, #blocked> loc(#loc22)
%41 = triton_gpu.convert_layout %3 : tensor<128xi32, #blocked3> -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc23)
%42 = tt.expand_dims %41 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x128xi32, #blocked5> loc(#loc23)
%43 = triton_gpu.convert_layout %42 : tensor<1x128xi32, #blocked5> -> tensor<1x128xi32, #blocked1> loc(#loc24)
%44 = tt.broadcast %43 : tensor<1x128xi32, #blocked1> -> tensor<256x128xi32, #blocked1> loc(#loc24)
%45 = tt.splat %arg9 : i32 -> tensor<256x1xi32, #blocked> loc(#loc25)
%46 = arith.muli %8, %45 : tensor<256x1xi32, #blocked> loc(#loc25)
%47 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<256x1x!tt.ptr<f16>, #blocked> loc(#loc26)
%48 = tt.addptr %47, %46 : tensor<256x1x!tt.ptr<f16>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc26)
%49 = tt.broadcast %48 : tensor<256x1x!tt.ptr<f16>, #blocked> -> tensor<256x128x!tt.ptr<f16>, #blocked> loc(#loc27)
%50 = triton_gpu.convert_layout %49 : tensor<256x128x!tt.ptr<f16>, #blocked> -> tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc27)
%51 = tt.broadcast %28 : tensor<1x128xi32, #blocked1> -> tensor<256x128xi32, #blocked1> loc(#loc27)
%52 = tt.addptr %50, %51 : tensor<256x128x!tt.ptr<f16>, #blocked1>, tensor<256x128xi32, #blocked1> loc(#loc27)
%53 = tt.splat %arg4 : i32 -> tensor<256xi32, #blocked3> loc(#loc28)
%54 = arith.cmpi slt, %2, %53 : tensor<256xi32, #blocked3> loc(#loc28)
%55 = triton_gpu.convert_layout %54 : tensor<256xi1, #blocked3> -> tensor<256xi1, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc29)
%56 = tt.expand_dims %55 {axis = 1 : i32} : tensor<256xi1, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1xi1, #blocked4> loc(#loc29)
%57 = triton_gpu.convert_layout %56 : tensor<256x1xi1, #blocked4> -> tensor<256x1xi1, #blocked> loc(#loc30)
%58 = tt.splat %arg5 : i32 -> tensor<128xi32, #blocked3> loc(#loc31)
%59 = arith.cmpi slt, %4, %58 : tensor<128xi32, #blocked3> loc(#loc31)
%60 = triton_gpu.convert_layout %59 : tensor<128xi1, #blocked3> -> tensor<128xi1, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc32)
%61 = tt.expand_dims %60 {axis = 0 : i32} : tensor<128xi1, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x128xi1, #blocked5> loc(#loc32)
%62 = triton_gpu.convert_layout %61 : tensor<1x128xi1, #blocked5> -> tensor<1x128xi1, #blocked1> loc(#loc30)
%63 = tt.broadcast %57 : tensor<256x1xi1, #blocked> -> tensor<256x128xi1, #blocked> loc(#loc30)
%64 = triton_gpu.convert_layout %63 : tensor<256x128xi1, #blocked> -> tensor<256x128xi1, #blocked1> loc(#loc30)
%65 = tt.broadcast %62 : tensor<1x128xi1, #blocked1> -> tensor<256x128xi1, #blocked1> loc(#loc30)
%66 = arith.andi %64, %65 : tensor<256x128xi1, #blocked1> loc(#loc30)
%67 = scf.for %arg10 = %c0_i32 to %c3_i32 step %c1_i32 iter_args(%arg11 = %cst_0) -> (tensor<256x128xf32, #blocked1>) : i32 {
%68 = arith.mulf %arg11, %cst_0 : tensor<256x128xf32, #blocked1> loc(#loc34)
%69:3 = scf.for %arg12 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg13 = %68, %arg14 = %20, %arg15 = %35) -> (tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>) : i32 {
%75 = triton_gpu.convert_layout %arg14 : tensor<256x64x!tt.ptr<f16>, #blocked2> -> tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc36)
%76 = tt.load %75 : tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc36)
%77 = triton_gpu.convert_layout %arg15 : tensor<64x128x!tt.ptr<f16>, #blocked1> -> tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc37)
%78 = tt.load %77 : tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc37)
%79 = triton_gpu.convert_layout %78 : tensor<64x128xf16, #blocked6> -> tensor<64x128xf16, #blocked1> loc(#loc37)
%80 = triton_gpu.convert_layout %76 : tensor<256x64xf16, #blocked2> -> tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> loc(#loc36)
%81 = triton_gpu.convert_layout %79 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> loc(#loc37)
%82 = triton_gpu.convert_layout %arg13 : tensor<256x128xf32, #blocked1> -> tensor<256x128xf32, #blocked7> loc(#loc34)
%83 = tt.dot %80, %81, %82 : tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> -> tensor<256x128xf32, #blocked7> loc(#loc38)
%84 = triton_gpu.convert_layout %83 : tensor<256x128xf32, #blocked7> -> tensor<256x128xf32, #blocked1> loc(#loc39)
%85 = tt.addptr %arg14, %cst_2 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc40)
%86 = tt.addptr %arg15, %cst_1 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41)
scf.yield %84, %85, %86 : tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1> loc(#loc39)
} loc(#loc35)
%70:2 = scf.while (%arg12 = %69#0, %arg13 = %36) : (tensor<256x128xf32, #blocked1>, i32) -> (tensor<256x128xf32, #blocked1>, i32) {
%75 = arith.cmpi slt, %arg13, %arg6 : i32 loc(#loc43)
scf.condition(%75) %arg12, %arg13 : tensor<256x128xf32, #blocked1>, i32 loc(#loc43)
} do {
^bb0(%arg12: tensor<256x128xf32, #blocked1> loc(unknown), %arg13: i32 loc(unknown)):
%75 = arith.muli %arg13, %c32768_i32 : i32 loc(#loc44)
%76 = tt.addptr %arg3, %75 : !tt.ptr<f32>, i32 loc(#loc45)
%77 = tt.splat %76 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>, #blocked> loc(#loc46)
%78 = tt.addptr %77, %40 : tensor<256x1x!tt.ptr<f32>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc46)
%79 = tt.broadcast %78 : tensor<256x1x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked> loc(#loc24)
%80 = triton_gpu.convert_layout %79 : tensor<256x128x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc24)
%81 = tt.addptr %80, %44 : tensor<256x128x!tt.ptr<f32>, #blocked1>, tensor<256x128xi32, #blocked1> loc(#loc24)
%82 = triton_gpu.convert_layout %81 : tensor<256x128x!tt.ptr<f32>, #blocked1> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
%83 = tt.load %82 : tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
%84 = arith.addf %arg12, %83 : tensor<256x128xf32, #blocked1> loc(#loc48)
%85 = arith.addi %arg13, %c1_i32 : i32 loc(#loc49)
scf.yield %84, %85 : tensor<256x128xf32, #blocked1>, i32 loc(#loc50)
} loc(#loc42)
%71 = arith.truncf %70#0 : tensor<256x128xf32, #blocked1> to tensor<256x128xf16, #blocked1> loc(#loc51)
%72 = triton_gpu.convert_layout %52 : tensor<256x128x!tt.ptr<f16>, #blocked1> -> tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
%73 = triton_gpu.convert_layout %71 : tensor<256x128xf16, #blocked1> -> tensor<256x128xf16, #blocked1> loc(#loc51)
%74 = triton_gpu.convert_layout %66 : tensor<256x128xi1, #blocked1> -> tensor<256x128xi1, #blocked1> loc(#loc51)
tt.store %72, %73, %74 : tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
scf.yield %70#0 : tensor<256x128xf32, #blocked1> loc(#loc52)
} loc(#loc33)
tt.return loc(#loc53)
} loc(#loc)
} loc(#loc)
Diff: Reproducible vs Fix 1
92,110c92,111
< scf.for %arg10 = %c0_i32 to %c3_i32 step %c1_i32 : i32 {
< %67:3 = scf.for %arg11 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg12 = %cst_0, %arg13 = %20, %arg14 = %35) -> (tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>) : i32 {
< %73 = triton_gpu.convert_layout %arg13 : tensor<256x64x!tt.ptr<f16>, #blocked2> -> tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc35)
< %74 = tt.load %73 : tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc35)
< %75 = triton_gpu.convert_layout %arg14 : tensor<64x128x!tt.ptr<f16>, #blocked1> -> tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc36)
< %76 = tt.load %75 : tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc36)
< %77 = triton_gpu.convert_layout %76 : tensor<64x128xf16, #blocked6> -> tensor<64x128xf16, #blocked1> loc(#loc36)
< %78 = triton_gpu.convert_layout %74 : tensor<256x64xf16, #blocked2> -> tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> loc(#loc35)
< %79 = triton_gpu.convert_layout %77 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> loc(#loc36)
< %80 = triton_gpu.convert_layout %arg12 : tensor<256x128xf32, #blocked1> -> tensor<256x128xf32, #blocked7> loc(#loc37)
< %81 = tt.dot %78, %79, %80 : tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> -> tensor<256x128xf32, #blocked7> loc(#loc38)
< %82 = triton_gpu.convert_layout %81 : tensor<256x128xf32, #blocked7> -> tensor<256x128xf32, #blocked1> loc(#loc39)
< %83 = tt.addptr %arg13, %cst_2 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc40)
< %84 = tt.addptr %arg14, %cst_1 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41)
< scf.yield %82, %83, %84 : tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1> loc(#loc39)
< } loc(#loc34)
< %68:2 = scf.while (%arg11 = %67#0, %arg12 = %36) : (tensor<256x128xf32, #blocked1>, i32) -> (tensor<256x128xf32, #blocked1>, i32) {
< %73 = arith.cmpi slt, %arg12, %arg6 : i32 loc(#loc43)
< scf.condition(%73) %arg11, %arg12 : tensor<256x128xf32, #blocked1>, i32 loc(#loc43)
---
> %67 = scf.for %arg10 = %c0_i32 to %c3_i32 step %c1_i32 iter_args(%arg11 = %cst_0) -> (tensor<256x128xf32, #blocked1>) : i32 {
> %68 = arith.mulf %arg11, %cst_0 : tensor<256x128xf32, #blocked1> loc(#loc34)
> %69:3 = scf.for %arg12 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg13 = %68, %arg14 = %20, %arg15 = %35) -> (tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>) : i32 {
> %75 = triton_gpu.convert_layout %arg14 : tensor<256x64x!tt.ptr<f16>, #blocked2> -> tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc36)
> %76 = tt.load %75 : tensor<256x64x!tt.ptr<f16>, #blocked2> loc(#loc36)
> %77 = triton_gpu.convert_layout %arg15 : tensor<64x128x!tt.ptr<f16>, #blocked1> -> tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc37)
> %78 = tt.load %77 : tensor<64x128x!tt.ptr<f16>, #blocked6> loc(#loc37)
> %79 = triton_gpu.convert_layout %78 : tensor<64x128xf16, #blocked6> -> tensor<64x128xf16, #blocked1> loc(#loc37)
> %80 = triton_gpu.convert_layout %76 : tensor<256x64xf16, #blocked2> -> tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> loc(#loc36)
> %81 = triton_gpu.convert_layout %79 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> loc(#loc37)
> %82 = triton_gpu.convert_layout %arg13 : tensor<256x128xf32, #blocked1> -> tensor<256x128xf32, #blocked7> loc(#loc34)
> %83 = tt.dot %80, %81, %82 : tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked7}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked7}>> -> tensor<256x128xf32, #blocked7> loc(#loc38)
> %84 = triton_gpu.convert_layout %83 : tensor<256x128xf32, #blocked7> -> tensor<256x128xf32, #blocked1> loc(#loc39)
> %85 = tt.addptr %arg14, %cst_2 : tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<256x64xi32, #blocked2> loc(#loc40)
> %86 = tt.addptr %arg15, %cst_1 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1> loc(#loc41)
> scf.yield %84, %85, %86 : tensor<256x128xf32, #blocked1>, tensor<256x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1> loc(#loc39)
> } loc(#loc35)
> %70:2 = scf.while (%arg12 = %69#0, %arg13 = %36) : (tensor<256x128xf32, #blocked1>, i32) -> (tensor<256x128xf32, #blocked1>, i32) {
> %75 = arith.cmpi slt, %arg13, %arg6 : i32 loc(#loc43)
> scf.condition(%75) %arg12, %arg13 : tensor<256x128xf32, #blocked1>, i32 loc(#loc43)
112,124c113,125
< ^bb0(%arg11: tensor<256x128xf32, #blocked1> loc(unknown), %arg12: i32 loc(unknown)):
< %73 = arith.muli %arg12, %c32768_i32 : i32 loc(#loc44)
< %74 = tt.addptr %arg3, %73 : !tt.ptr<f32>, i32 loc(#loc45)
< %75 = tt.splat %74 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>, #blocked> loc(#loc46)
< %76 = tt.addptr %75, %40 : tensor<256x1x!tt.ptr<f32>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc46)
< %77 = tt.broadcast %76 : tensor<256x1x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked> loc(#loc24)
< %78 = triton_gpu.convert_layout %77 : tensor<256x128x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc24)
< %79 = tt.addptr %78, %44 : tensor<256x128x!tt.ptr<f32>, #blocked1>, tensor<256x128xi32, #blocked1> loc(#loc24)
< %80 = triton_gpu.convert_layout %79 : tensor<256x128x!tt.ptr<f32>, #blocked1> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
< %81 = tt.load %80 : tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
< %82 = arith.addf %arg11, %81 : tensor<256x128xf32, #blocked1> loc(#loc48)
< %83 = arith.addi %arg12, %c1_i32 : i32 loc(#loc49)
< scf.yield %82, %83 : tensor<256x128xf32, #blocked1>, i32 loc(#loc50)
---
> ^bb0(%arg12: tensor<256x128xf32, #blocked1> loc(unknown), %arg13: i32 loc(unknown)):
> %75 = arith.muli %arg13, %c32768_i32 : i32 loc(#loc44)
> %76 = tt.addptr %arg3, %75 : !tt.ptr<f32>, i32 loc(#loc45)
> %77 = tt.splat %76 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>, #blocked> loc(#loc46)
> %78 = tt.addptr %77, %40 : tensor<256x1x!tt.ptr<f32>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc46)
> %79 = tt.broadcast %78 : tensor<256x1x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked> loc(#loc24)
> %80 = triton_gpu.convert_layout %79 : tensor<256x128x!tt.ptr<f32>, #blocked> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc24)
> %81 = tt.addptr %80, %44 : tensor<256x128x!tt.ptr<f32>, #blocked1>, tensor<256x128xi32, #blocked1> loc(#loc24)
> %82 = triton_gpu.convert_layout %81 : tensor<256x128x!tt.ptr<f32>, #blocked1> -> tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
> %83 = tt.load %82 : tensor<256x128x!tt.ptr<f32>, #blocked1> loc(#loc47)
> %84 = arith.addf %arg12, %83 : tensor<256x128xf32, #blocked1> loc(#loc48)
> %85 = arith.addi %arg13, %c1_i32 : i32 loc(#loc49)
> scf.yield %84, %85 : tensor<256x128xf32, #blocked1>, i32 loc(#loc50)
126,130c127,132
< %69 = arith.truncf %68#0 : tensor<256x128xf32, #blocked1> to tensor<256x128xf16, #blocked1> loc(#loc51)
< %70 = triton_gpu.convert_layout %52 : tensor<256x128x!tt.ptr<f16>, #blocked1> -> tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
< %71 = triton_gpu.convert_layout %69 : tensor<256x128xf16, #blocked1> -> tensor<256x128xf16, #blocked1> loc(#loc51)
< %72 = triton_gpu.convert_layout %66 : tensor<256x128xi1, #blocked1> -> tensor<256x128xi1, #blocked1> loc(#loc51)
< tt.store %70, %71, %72 : tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
---
> %71 = arith.truncf %70#0 : tensor<256x128xf32, #blocked1> to tensor<256x128xf16, #blocked1> loc(#loc51)
> %72 = triton_gpu.convert_layout %52 : tensor<256x128x!tt.ptr<f16>, #blocked1> -> tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
> %73 = triton_gpu.convert_layout %71 : tensor<256x128xf16, #blocked1> -> tensor<256x128xf16, #blocked1> loc(#loc51)
> %74 = triton_gpu.convert_layout %66 : tensor<256x128xi1, #blocked1> -> tensor<256x128xi1, #blocked1> loc(#loc51)
> tt.store %72, %73, %74 : tensor<256x128x!tt.ptr<f16>, #blocked1> loc(#loc51)
> scf.yield %70#0 : tensor<256x128xf32, #blocked1> loc(#loc52)
132c134
< tt.return loc(#loc52)
---
> tt.return loc(#loc53)
from triton.
LayoutPropagation Dump after layoutPropagation.propagateLayout()
Reproducible
Value: %74 = tt.load %73 : tensor<256x64x!tt.ptr<f16>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %76 = tt.load %75 : tensor<64x128x!tt.ptr<f16>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
--
Value: %81 = tt.dot %78, %79, %80 : tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>> -> tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %81 = tt.load %80 : tensor<256x128x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
--
Fix 1
Value: %76 = tt.load %75 : tensor<256x64x!tt.ptr<f16>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %78 = tt.load %77 : tensor<64x128x!tt.ptr<f16>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
--
Value: %83 = tt.dot %80, %81, %82 : tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>> -> tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %83 = tt.load %82 : tensor<256x128x!tt.ptr<f32>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
--
Value: %84 = arith.addf %arg12, %83 : tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: <block argument> of type 'tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>' at index: 0
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %69:3 = scf.for %arg12 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg13 = %68, %arg14 = %20, %arg15 = %35) -> (tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>, tensor<256x64x!tt.ptr<f16>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>>, tensor<64x128x!tt.ptr<f16>, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>) : i32 {...}
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: <block argument> of type 'tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>' at index: 0
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %70:2 = scf.while (%arg12 = %69#0, %arg13 = %36) : (tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>, i32) -> (tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>, i32) {...} do {...}
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %67 = scf.for %arg10 = %c0_i32 to %c3_i32 step %c1_i32 iter_args(%arg11 = %cst_0) -> (tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>) : i32 {...}
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: <block argument> of type 'tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>' at index: 1
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %71 = arith.truncf %70#0 : tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>> to tensor<256x128xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %73 = triton_gpu.convert_layout %71 : tensor<256x128xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>> -> tensor<256x128xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %68 = arith.mulf %arg11, %cst_0 : tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: <block argument> of type 'tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>' at index: 1
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %82 = triton_gpu.convert_layout %arg13 : tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>> -> tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %84 = triton_gpu.convert_layout %83 : tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>> -> tensor<256x128xf32, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
--
Value: %79 = triton_gpu.convert_layout %78 : tensor<64x128xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>> -> tensor<64x128xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
--
Value: %81 = triton_gpu.convert_layout %79 : tensor<64x128xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>> -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
--
Value: %80 = triton_gpu.convert_layout %76 : tensor<256x64xf16, #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>> -> tensor<256x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #triton_gpu.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>>
encoding:
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
--
from triton.
Related Issues (20)
- Support masking in atomic_cas?
- Triton Error [CUDA]: device kernel image is invalid HOT 1
- Cannot specify which device to use
- flash-attention gradient calculation fail due to numerical error HOT 1
- Help build WHL for Windows
- How to add a pow function in python.triton.language.core? HOT 1
- tensor not support index in nvidia gpu?
- No matching distribution found for triton-nightly
- cdot minimal size operation?
- How to create a constant in kernel function
- It raises error when I run 06-fused-attention.py
- How to understand getPtrAlignment in axisinfo? HOT 1
- `03-matrix-multiplication.py` results in inaccurate numerics on H100 and MI300 using `float32` HOT 4
- Pytorch/Triton of 2.3.0 version loads bool type mask in triton, encountering error when use tl.where HOT 4
- Unable to download llvm when building from source. HOT 9
- can we use triton kernel in our model when using torch.onnx.export
- [BUG] Using both reduction and atomic operations along with autotune makes incorrect results HOT 2
- cross-compile triton kernels with tools/compile.py
- Enhance triton functionality to allow use of custom cuda toolkit instead of downloading specific version from conda
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from triton.