// Executable target for the "amd-aie" backend using the "elf" format.
// The alias definition must start with '#': it is referenced below as
// `#executable_target_elf` by the device target alias and by the
// hal.executable.variant op.
#executable_target_elf = #hal.executable.target<"amd-aie", "elf", {target_arch = "chip-tbd"}>
// Scale an index by 16 / by 64. The dispatch function applies these to the
// outer scf.forall induction variables to obtain subview offsets.
#map = affine_map<(d0) -> (d0 * 16)>
#map1 = affine_map<(d0) -> (d0 * 64)>
// Indexing maps of the 9-D linalg.generic in the dispatch function, in operand
// order: first input (#map2), second input (#map3), output (#map4).
// d2, d5 and d8 do not appear in the output map (#map4); they are the
// dimensions the generic declares as "reduction".
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
// One descriptor set (0) with three storage-buffer bindings and no push
// constants; bindings 0 and 1 are ReadOnly, binding 2 carries no such flag.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
// Selects the TransformDialectCodegen pipeline with codegen spec
// @__transform_main (that symbol is not defined in the visible part of this file).
#translation = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @__transform_main>
// Device target "amd-aie" exposing the ELF executable target above, with the
// `legacy_sync` flag set.
#device_target_amd_aie = #hal.device.target<"amd-aie", {executable_targets = [#executable_target_elf], legacy_sync}>
module attributes {hal.device.targets = [#device_target_amd_aie]} {
  hal.executable private @matmul_example_dispatch_0 {
    hal.executable.variant public @elf target(#executable_target_elf) {
      // Export entry for the dispatch. Its region returns the index triple
      // (32, 1, 1); 32 equals the extent of the second dimension of the outer
      // scf.forall in the function body.
      hal.executable.export public @matmul_example_dispatch_0_matmul_4x2048x2048_i8xi8xi32 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
      ^bb0(%device: !hal.device):
        %count_x = arith.constant 32 : index
        %count_yz = arith.constant 1 : index
        hal.return %count_x, %count_yz, %count_yz : index, index, index
      }
      builtin.module {
        // i8 x i8 -> i32 matmul of a 4x2048 buffer (binding 0) with a 2048x2048
        // buffer (binding 1), accumulated into a 4x2048 i32 buffer (binding 2).
        // Data is staged through "shared" and then "local" memref allocations
        // using iree_linalg_ext.pack / unpack around a 9-D linalg.generic.
        func.func @matmul_example_dispatch_0_matmul_4x2048x2048_i8xi8xi32() {
          // Scalar constants: fill/padding values and the reduction loop bounds.
          %zero_i32 = arith.constant 0 : i32
          %idx0 = arith.constant 0 : index
          %k_step = arith.constant 64 : index
          %k_end = arith.constant 2048 : index
          %zero_i8 = arith.constant 0 : i8

          // Staging buffers in the "local" memory space (operands of the generic).
          %acc_local = memref.alloc() : memref<1x1x8x4x4x8xi32, "local">
          %rhs_local = memref.alloc() : memref<1x1x8x8x8x8xi8, "local">
          %lhs_local = memref.alloc() : memref<1x1x8x4x4x8xi8, "local">
          // Staging buffers in the "shared" memory space.
          %acc_shared = memref.alloc() : memref<1x1x16x64xi32, "shared">
          %rhs_shared = memref.alloc() : memref<1x1x64x64xi8, "shared">
          %lhs_shared = memref.alloc() : memref<1x1x16x64xi8, "shared">

          // Bind the three storage buffers of set 0, each assumed 64-byte aligned.
          %lhs_buf = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%idx0) flags(ReadOnly) : memref<4x2048xi8, #hal.descriptor_type<storage_buffer>>
          memref.assume_alignment %lhs_buf, 64 : memref<4x2048xi8, #hal.descriptor_type<storage_buffer>>
          %rhs_buf = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%idx0) flags(ReadOnly) : memref<2048x2048xi8, #hal.descriptor_type<storage_buffer>>
          memref.assume_alignment %rhs_buf, 64 : memref<2048x2048xi8, #hal.descriptor_type<storage_buffer>>
          %out_buf = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%idx0) : memref<4x2048xi32, #hal.descriptor_type<storage_buffer>>
          memref.assume_alignment %out_buf, 64 : memref<4x2048xi32, #hal.descriptor_type<storage_buffer>>

          // Outer tiling: 1 x 32 iterations, each owning one 4x64 output tile.
          scf.forall (%blk_m, %blk_n) in (1, 32) {
            %out_row = affine.apply #map(%blk_m)
            %out_col = affine.apply #map1(%blk_n)
            %out_tile = memref.subview %out_buf[%out_row, %out_col] [4, 64] [1, 1] : memref<4x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<4x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
            // Zero the output tile before accumulating across the reduction loop.
            linalg.fill ins(%zero_i32 : i32) outs(%out_tile : memref<4x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)

            // Reduction loop: 0 to 2048 in steps of 64.
            scf.for %k = %idx0 to %k_end step %k_step {
              // 4x64 slice of binding 0 and 64x64 slice of binding 1 for this step.
              %lhs_row = affine.apply #map(%blk_m)
              %lhs_tile = memref.subview %lhs_buf[%lhs_row, %k] [4, 64] [1, 1] : memref<4x2048xi8, #hal.descriptor_type<storage_buffer>> to memref<4x64xi8, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %rhs_col = affine.apply #map1(%blk_n)
              %rhs_tile = memref.subview %rhs_buf[%k, %rhs_col] [64, 64] [1, 1] : memref<2048x2048xi8, #hal.descriptor_type<storage_buffer>> to memref<64x64xi8, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>

              // First-level pack into the "shared" buffers. The 4-row tiles are
              // packed with a 16x64 inner tile, so a zero padding value is given.
              iree_linalg_ext.pack %lhs_tile padding_value(%zero_i8 : i8) inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %lhs_shared : (memref<4x64xi8, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi8, "shared">)
              iree_linalg_ext.pack %rhs_tile outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %rhs_shared : (memref<64x64xi8, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x64xi8, "shared">)
              iree_linalg_ext.pack %out_tile padding_value(%zero_i32 : i32) inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %acc_shared : (memref<4x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x16x64xi32, "shared">)

              // Inner tiling: a single (1 x 1) iteration.
              scf.forall (%thr_m, %thr_n) in (1, 1) {
                %lhs_view = memref.subview %lhs_shared[%thr_m, 0, 0, 0] [1, 1, 16, 64] [1, 1, 1, 1] : memref<1x1x16x64xi8, "shared"> to memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, "shared">
                %rhs_view = memref.subview %rhs_shared[0, %thr_n, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi8, "shared"> to memref<1x1x64x64xi8, strided<[4096, 4096, 64, 1], offset: ?>, "shared">
                %acc_view = memref.subview %acc_shared[%thr_m, %thr_n, 0, 0] [1, 1, 16, 64] [1, 1, 1, 1] : memref<1x1x16x64xi32, "shared"> to memref<1x1x16x64xi32, strided<[1024, 1024, 64, 1], offset: ?>, "shared">

                // Second-level pack into the "local" buffers (4x8 / 8x8 inner
                // tiles, with the two tiled outer dimensions swapped).
                iree_linalg_ext.pack %lhs_view outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %lhs_local : (memref<1x1x16x64xi8, strided<[1024, 1024, 64, 1], offset: ?>, "shared"> memref<1x1x8x4x4x8xi8, "local">)
                iree_linalg_ext.pack %rhs_view outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 8] into %rhs_local : (memref<1x1x64x64xi8, strided<[4096, 4096, 64, 1], offset: ?>, "shared"> memref<1x1x8x8x8x8xi8, "local">)
                iree_linalg_ext.pack %acc_view outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %acc_local : (memref<1x1x16x64xi32, strided<[1024, 1024, 64, 1], offset: ?>, "shared"> memref<1x1x8x4x4x8xi32, "local">)

                // Multiply-accumulate on the packed operands: both i8 inputs are
                // sign-extended to i32, multiplied, and added to the accumulator.
                linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%lhs_local, %rhs_local : memref<1x1x8x4x4x8xi8, "local">, memref<1x1x8x8x8x8xi8, "local">) outs(%acc_local : memref<1x1x8x4x4x8xi32, "local">) {
                ^bb0(%a: i8, %b: i8, %acc: i32):
                  %a_ext = arith.extsi %a : i8 to i32
                  %b_ext = arith.extsi %b : i8 to i32
                  %prod = arith.muli %a_ext, %b_ext : i32
                  %sum = arith.addi %acc, %prod : i32
                  linalg.yield %sum : i32
                }

                // Write the "local" accumulator back into its "shared" view.
                iree_linalg_ext.unpack %acc_local outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %acc_view : (memref<1x1x8x4x4x8xi32, "local"> memref<1x1x16x64xi32, strided<[1024, 1024, 64, 1], offset: ?>, "shared">)
              } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

              // Write the "shared" accumulator back into the 4x64 output tile.
              iree_linalg_ext.unpack %acc_shared inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %out_tile : (memref<1x1x16x64xi32, "shared"> memref<4x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
            }
          } {mapping = [#gpu.block<y>, #gpu.block<x>]}

          // Release every staging buffer allocated at function entry.
          memref.dealloc %lhs_shared : memref<1x1x16x64xi8, "shared">
          memref.dealloc %rhs_shared : memref<1x1x64x64xi8, "shared">
          memref.dealloc %acc_shared : memref<1x1x16x64xi32, "shared">
          memref.dealloc %lhs_local : memref<1x1x8x4x4x8xi8, "local">
          memref.dealloc %rhs_local : memref<1x1x8x8x8x8xi8, "local">
          memref.dealloc %acc_local : memref<1x1x8x4x4x8xi32, "local">
          return
        }
      }
    }
  }
}