Vulkan compile errors for llama model from sharktank #17304
Comments
Tried with
The source for hal.executable @prefill_bs4$async_dispatch_1:

hal.executable public @prefill_bs4$async_dispatch_1 {
hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, PhysicalStorageBufferAddresses, VariablePointers, VariablePointersStorageBuffer, DotProduct, DotProductInputAll, DotProductInput4x8BitPacked, DotProductInput4x8Bit, CooperativeMatrixKHR], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_integer_dot_product, SPV_KHR_storage_buffer_storage_class, SPV_KHR_physical_storage_buffer, SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>) {
hal.executable.export public @prefill_bs4$async_dispatch_1_generic_4xDx3200_i64xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @prefill_bs4$async_dispatch_1_generic_4xDx3200_i64xf32() attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 1, 1]>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
%4 = arith.extui %1 : i32 to i64
%5 = arith.shli %4, %c32_i64 : i64
%6 = arith.extui %0 : i32 to i64
%7 = arith.ori %6, %5 : i64
%8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
%9 = arith.extui %3 : i32 to i64
%10 = arith.shli %9, %c32_i64 : i64
%11 = arith.extui %2 : i32 to i64
%12 = arith.ori %11, %10 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x3200xf16>>
%15 = flow.dispatch.workload.ordinal %13, 0 : index
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4x?xi64>>{%15}
%17 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<4x?x3200xf32>>{%15}
%18 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [32000, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x3200xf16>> -> tensor<32000x3200xf16>
%19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [4, %15], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x?xi64>>{%15} -> tensor<4x?xi64>
%20 = tensor.empty(%15) : tensor<4x?x3200xf32>
%21 = tensor.empty() : tensor<32000x3200xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<32000x3200xf16>) outs(%21 : tensor<32000x3200xf32>) {
^bb0(%in: f16, %out: f32):
%24 = arith.extf %in : f16 to f32
linalg.yield %24 : f32
} -> tensor<32000x3200xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%19 : tensor<4x?xi64>) outs(%20 : tensor<4x?x3200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 32], [1, 1, 1]]>} {
^bb0(%in: i64, %out: f32):
%24 = arith.index_cast %in : i64 to index
%25 = linalg.index 2 : index
%extracted = tensor.extract %22[%24, %25] : tensor<32000x3200xf32>
linalg.yield %extracted : f32
} -> tensor<4x?x3200xf32>
flow.dispatch.tensor.store %23, %17, offsets = [0, 0, 0], sizes = [4, %15, 3200], strides = [1, 1, 1] : tensor<4x?x3200xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x?x3200xf32>>{%15}
return
}
}
}
}
I still see the error for hal.executable @prefill_bs4$async_dispatch_0:

hal.executable public @prefill_bs4$async_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, PhysicalStorageBufferAddresses, VariablePointers, VariablePointersStorageBuffer, DotProduct, DotProductInputAll, DotProductInput4x8BitPacked, DotProductInput4x8Bit, CooperativeMatrixKHR], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_integer_dot_product, SPV_KHR_storage_buffer_storage_class, SPV_KHR_physical_storage_buffer, SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>) {
hal.executable.export public @prefill_bs4$async_dispatch_0_conv_4xDxD_i1xi64xf32xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]} {
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @prefill_bs4$async_dispatch_0_conv_4xDxD_i1xi64xf32xf32() attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 1, 1]>} {
%c0 = arith.constant 0 : index
%c819200 = arith.constant 819200 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0xFF800000 : f32
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = arith.extui %1 : i32 to i64
%3 = arith.shli %2, %c32_i64 : i64
%4 = arith.extui %0 : i32 to i64
%5 = arith.ori %4, %3 : i64
%6 = arith.index_castui %5 : i64 to index
%7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4xi64>>
%8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<f32>>
%9 = flow.dispatch.workload.ordinal %6, 0 : index
%10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c819200) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x?x2048xi8>>{%9}
%11 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x?x?xf32>>{%9, %9}
%12 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xi64>> -> tensor<4xi64>
%13 = flow.dispatch.tensor.load %8, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%14 = tensor.empty(%9, %9) : tensor<4x?x?xf32>
%15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [1, 1, %9, %9], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x?x2048xi8>>{%9} -> tensor<?x?xi8>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %12, %13 : tensor<?x?xi8>, tensor<4xi64>, tensor<f32>) outs(%14 : tensor<4x?x?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 1, 32], [1, 1, 1]]>} {
^bb0(%in: i8, %in_0: i64, %in_1: f32, %out: f32):
%17 = arith.trunci %in : i8 to i1
%18 = linalg.index 2 : index
%19 = arith.index_cast %18 : index to i64
%20 = arith.cmpi sge, %19, %in_0 : i64
%21 = arith.addi %17, %20 : i1
%22 = arith.select %21, %cst, %in_1 : f32
linalg.yield %22 : f32
} -> tensor<4x?x?xf32>
flow.dispatch.tensor.store %16, %11, offsets = [0, 0, 0], sizes = [4, %9, %9], strides = [1, 1, 1] : tensor<4x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x?x?xf32>>{%9, %9}
return
}
}
}
}
Copying from nod-ai/shark-ai#22 (comment):
llama model export has changed substantially since this was filed. Calling this stale. Could file a new issue if a recent export fails to compile for Vulkan too.
What happened?
I'm following the examples in sharktank to export the f16 GGUF file from https://huggingface.co/SlyEcho/open_llama_3b_v2_gguf. When I try to compile through IREE CPU (--iree-hal-target-backends=llvm-cpu) I hit #17244; for Vulkan (--iree-hal-target-backends=vulkan-spirv), I hit these errors.

Steps to reproduce your issue
1. Get open_llama_3b_v2_f16.mlir: https://sharkpublic.blob.core.windows.net/sharkpublic/scotttodd/issue_reports/open_llama_3b_v2_f16.mlir (or re-export with https://github.com/nod-ai/sharktank/tree/main/sharktank#examples)
2. Run iree-compile open_llama_3b_v2_f16.mlir --iree-hal-target-backends=vulkan-spirv -o /tmp/open_llama_3b_v2_f16_vulkan.vmfb (see the command sketch after this list)
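As a convenience, here is a minimal shell sketch of those two steps. The file name, URLs, and iree-compile flags are taken from the steps above; using curl for the download is an assumption (any downloader works).

# Fetch the exported MLIR (URL from the steps above; curl is just one option).
curl -L -o open_llama_3b_v2_f16.mlir \
  https://sharkpublic.blob.core.windows.net/sharkpublic/scotttodd/issue_reports/open_llama_3b_v2_f16.mlir

# Compile for Vulkan; this is the invocation that hits the errors reported here.
iree-compile open_llama_3b_v2_f16.mlir \
  --iree-hal-target-backends=vulkan-spirv \
  -o /tmp/open_llama_3b_v2_f16_vulkan.vmfb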
Full errors: (collapsed stderr output)
What component(s) does this issue relate to?
Compiler
Version information
a075013
Additional context
I might be able to work around the shared memory issue with --iree-vulkan-target-triple matching my GPU, but we need a much better story with default flags. Haven't seen the spirv.IAdd issue with i1 before.
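A minimal sketch of that workaround, assuming the target-triple approach mentioned above; the triple value below is only an illustrative guess for an NVIDIA GPU and is not taken from the issue, so substitute one matching your hardware.

# Workaround sketch (not from the issue): pass a Vulkan target triple matching the GPU
# instead of relying on the default target. The triple value here is an illustrative guess.
iree-compile open_llama_3b_v2_f16.mlir \
  --iree-hal-target-backends=vulkan-spirv \
  --iree-vulkan-target-triple=ampere-unknown-unknown \
  -o /tmp/open_llama_3b_v2_f16_vulkan.vmfb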