diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/DpasFuncs/DpasFuncsResolution.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/DpasFuncs/DpasFuncsResolution.cpp index d5ea47842..99aa479e9 100644 --- a/IGC/Compiler/Optimizer/OpenCLPasses/DpasFuncs/DpasFuncsResolution.cpp +++ b/IGC/Compiler/Optimizer/OpenCLPasses/DpasFuncs/DpasFuncsResolution.cpp @@ -107,6 +107,8 @@ private: /// static const StringRef SG_PREFIX_IDPAS16; static const StringRef SG_PREFIX_FDPAS16; + static const StringRef SG_PREFIX_IDPAS32N16; + static const StringRef SG_PREFIX_FDPAS32N16; // PVC+: pure hf/bf dpas builtins static const StringRef WI_PREFIX_HFDPAS; static const StringRef WI_PREFIX_BFDPAS; @@ -200,6 +202,8 @@ const StringRef DpasFuncsResolution::WI_PREFIX_IDPAS = "__builtin_IB_idpas"; const StringRef DpasFuncsResolution::WI_PREFIX_FDPAS = "__builtin_IB_fdpas"; const StringRef DpasFuncsResolution::SG_PREFIX_IDPAS16 = "__builtin_IB_sub_group16_idpas"; const StringRef DpasFuncsResolution::SG_PREFIX_FDPAS16 = "__builtin_IB_sub_group16_fdpas"; +const StringRef DpasFuncsResolution::SG_PREFIX_IDPAS32N16 = "__builtin_IB_sub_group32n16_idpas"; +const StringRef DpasFuncsResolution::SG_PREFIX_FDPAS32N16 = "__builtin_IB_sub_group32n16_fdpas"; // PVC+: pure hf/bf dpas builtins const StringRef DpasFuncsResolution::WI_PREFIX_HFDPAS = "__builtin_IB_hfdpas"; const StringRef DpasFuncsResolution::WI_PREFIX_BFDPAS = "__builtin_IB_bfdpas"; @@ -263,6 +267,11 @@ void DpasFuncsResolution::visitCallInst(CallInst &CI) { bool IsDpasw = false; bool IsIDpas = false; + // Dimension N is platform specific and is directly correlated to minimum subgroup-size for + // given platform. If DPAS with the same M, N, K dimensions is executed within a subgroup + // twice the size of minimum subgroup-size, each work item must contain half of the data + // compared to the minimum subgroup-size. 
+ bool IsDoubleSubgroup = false; int DstTy, AccTy, PA, PB, SD, RC; GenISAIntrinsic::ID iid = GenISAIntrinsic::no_intrinsic; bool doVerify = false; @@ -277,12 +286,26 @@ void DpasFuncsResolution::visitCallInst(CallInst &CI) { if (!demangleSuffix(funcName, SG_PREFIX_LEN, false, IsIDpas, DstTy, AccTy, PA, PB, SD, RC, nullptr)) return; iid = GenISAIntrinsic::GenISA_sub_group_dpas; + } else if (funcName.startswith(DpasFuncsResolution::SG_PREFIX_IDPAS32N16)) { + const int SG_PREFIX_LEN = DpasFuncsResolution::SG_PREFIX_IDPAS32N16.size(); + IsIDpas = true; + IsDoubleSubgroup = true; + if (!demangleSuffix(funcName, SG_PREFIX_LEN, false, IsIDpas, DstTy, AccTy, PA, PB, SD, RC, nullptr)) + return; + iid = GenISAIntrinsic::GenISA_sub_group_dpas; } else if (funcName.startswith(DpasFuncsResolution::SG_PREFIX_FDPAS16)) { const int SG_PREFIX_LEN = DpasFuncsResolution::SG_PREFIX_FDPAS16.size(); IsIDpas = false; if (!demangleSuffix(funcName, SG_PREFIX_LEN, true, IsIDpas, DstTy, AccTy, PA, PB, SD, RC, nullptr)) return; iid = GenISAIntrinsic::GenISA_sub_group_dpas; + } else if (funcName.startswith(DpasFuncsResolution::SG_PREFIX_FDPAS32N16)) { + const int SG_PREFIX_LEN = DpasFuncsResolution::SG_PREFIX_FDPAS32N16.size(); + IsIDpas = false; + IsDoubleSubgroup = true; + if (!demangleSuffix(funcName, SG_PREFIX_LEN, true, IsIDpas, DstTy, AccTy, PA, PB, SD, RC, nullptr)) + return; + iid = GenISAIntrinsic::GenISA_sub_group_dpas; } else { return; @@ -363,6 +386,14 @@ void DpasFuncsResolution::visitCallInst(CallInst &CI) { Type *A_BaseTy = ATy->getScalarType(); Type *B_BaseTy = BTy->getScalarType(); + if (IsDoubleSubgroup) { + IGC_ASSERT_MESSAGE(RC >= 2, "ICE: repeat count of DPAS for double subgroup-size must be >= 2!"); + D_nelts *= 2; + ACC_nelts *= 2; + A_nelts *= 2; + B_nelts *= 2; + } + if (IsIDpas) { uint32_t Abits = getPrecisionInBits((PrecisionType)PA); uint32_t Bbits = getPrecisionInBits((PrecisionType)PB); diff --git 
a/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.cpp index c8b1c69e6..ecaccc0f1 100644 --- a/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.cpp +++ b/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.cpp @@ -272,7 +272,7 @@ int SpvSubgroupMMAResolution::getElemCount(const Type *Ty) const { } bool SpvSubgroupMMAResolution::validateElemCounts(int M, int AElemCount, int BElemCount, uint32_t Operands, - const CallInst &CI) { + CallInst &CI) { if (M != 1 && M != 2 && M != 4 && M != 8) { emitError( "__spirv_SubgroupMatrixMultiplyAccumulateINTEL: M dimension must be 1, 2, 4 or 8 for targeted HW. Actual: " + @@ -295,16 +295,27 @@ bool SpvSubgroupMMAResolution::validateElemCounts(int M, int AElemCount, int BEl CI); return false; } - if (BElemCount != 8) { - emitError("__spirv_SubgroupMatrixMultiplyAccumulateINTEL: Matrix B argument must have 8 components for targeted " - "HW. Actual: " + - std::to_string(BElemCount), + const int expectedBCount = isDoubleSubgroup(CI) ? 4 : 8; + if (BElemCount != expectedBCount) { + emitError("__spirv_SubgroupMatrixMultiplyAccumulateINTEL: Matrix B argument must have " + + std::to_string(expectedBCount) + + " components for targeted HW. Actual: " + std::to_string(BElemCount), CI); return false; } return true; } +// Dimension N is platform specific and is directly correlated to minimum subgroup-size for +// given platform. If DPAS with the same M, N, K dimensions is executed within a subgroup +// twice the size of minimum subgroup-size, each work item must contain half of the data +// compared to the minimum subgroup-size. 
+bool SpvSubgroupMMAResolution::isDoubleSubgroup(CallInst &CI) {
+  if (!m_Ctx->platform.hasExecSize16DPAS())
+    return false; // platform does not report exec-size-16 DPAS: never treated as a double subgroup
+  return IGC::getSIMDSize(getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils(), CI.getParent()->getParent()) == 32;
+}
+
 SpvSubgroupMMAResolution::SupportedTable *SpvSubgroupMMAResolution::getSupportedTable() {
   if (m_Ctx->platform.hasExecSize16DPAS()) {
     if (m_Simd16Table.empty())
@@ -480,9 +491,16 @@ void SpvSubgroupMMAResolution::visitCallInst(CallInst &CI) {
   SmallVector argTypes({c->getType(), a->getType(), b->getType()});
   FunctionType *FT = FunctionType::get(CI.getType(), argTypes, false);
 
+  std::string subgroupSize;
+  if (isDoubleSubgroup(CI)) {
+    subgroupSize = "32n16";
+    M *= 2;
+  } else {
+    subgroupSize = m_Ctx->platform.hasExecSize16DPAS() ? "16" : "";
+  }
+
   std::stringstream newFuncName;
-  newFuncName << "__builtin_IB_sub_group";
-  newFuncName << (m_Ctx->platform.hasExecSize16DPAS() ? "16" : "");
+  newFuncName << "__builtin_IB_sub_group" << subgroupSize;
   newFuncName << "_" << (ResultElemTy == I32 ? 
"i" : "f"); newFuncName << "dpas_" << OperandsIt->second.str() << "8_" << M; diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.hpp b/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.hpp index a9213302b..5ddedf9b2 100644 --- a/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.hpp +++ b/IGC/Compiler/Optimizer/OpenCLPasses/SpvSubgroupMMAResolution/SpvSubgroupMMAResolution.hpp @@ -72,7 +72,7 @@ private: bool validateI32Constant(const llvm::Value *V, const llvm::Twine &ParamName, const llvm::CallInst &CI); bool validateCType(const llvm::Type *ResultTy, const llvm::Type *CType, const llvm::CallInst &CI); bool validateElementType(const ElType ElemTy, llvm::StringRef ParamName, const llvm::CallInst &CI); - bool validateElemCounts(int M, int AElemCount, int BElemCount, uint32_t Operands, const llvm::CallInst &CI); + bool validateElemCounts(int M, int AElemCount, int BElemCount, uint32_t Operands, llvm::CallInst &CI); template bool validateKDimInTable(const T KIt, int K, const SupportedTable *table, const llvm::CallInst &CI); @@ -89,6 +89,8 @@ private: bool validateOperands(const T OpIt, int K, ElType ResultElemTy, ElType AElemTy, ElType BElemTy, uint32_t Operands, const OperandsTable &operandMap, const llvm::CallInst &CI); + bool isDoubleSubgroup(llvm::CallInst &CI); + llvm::DenseSet m_BuiltinsToRemove; bool m_Changed = false; IGC::CodeGenContext *m_Ctx = nullptr; diff --git a/IGC/Compiler/tests/DpasFuncsResolution/dpas-pvc-simd32.ll b/IGC/Compiler/tests/DpasFuncsResolution/dpas-pvc-simd32.ll new file mode 100644 index 000000000..cb4ccf713 --- /dev/null +++ b/IGC/Compiler/tests/DpasFuncsResolution/dpas-pvc-simd32.ll @@ -0,0 +1,26 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2025 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice 
============================= +; UNSUPPORTED: system-windows +; REQUIRES: debug, llvm-14-plus +; RUN: not igc_opt --opaque-pointers -platformpvc --igc-arith-funcs-translation -S < %s 2>&1 | FileCheck %s +; ------------------------------------------------ +; DpasFuncsResolution +; ------------------------------------------------ + +; Check assertion unique to DPAS in double-subgroup size. + +; CHECK: RC >= 2, ICE: repeat count of DPAS for double subgroup-size must be >= 2! + +define spir_kernel void @test_dpas(<4 x i32> %src, i32 %src2, ptr %dst) { + %1 = load i16, ptr %dst, align 4 + %2 = call i32 @__builtin_IB_sub_group32n16_idpas_s8_s8_8_1(i32 %src2, i16 %1, <4 x i32> %src) + store i32 %2, ptr %dst, align 4 + ret void +} + +declare i32 @__builtin_IB_sub_group32n16_idpas_s8_s8_8_1(i32, i16, <4 x i32>) diff --git a/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc.ll b/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc.ll index d7b08734b..7092a6ee0 100644 --- a/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc.ll +++ b/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc.ll @@ -10,7 +10,8 @@ ; RUN: llvm-as %s -o %t.bc ; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_subgroup_matrix_multiply_accumulate -o %t.spv -; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'PrintToConsole=1 PrintAfter=ArithmeticFuncsTranslation'" 2>&1 | FileCheck %s +; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'PrintToConsole=1 PrintAfter=ArithmeticFuncsTranslation'" 2>&1 | FileCheck %s --check-prefix=CHECK-GENISA +; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'DumpVISAASMToConsole=1'" 2>&1 | FileCheck %s --check-prefix=CHECK-VISAASM target triple = "spir64-unknown-unknown" @@ -45,15 +46,41 @@ define spir_kernel void @test_v1(i32* %res1I32, <2 x 
i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v1( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 4, i32 4, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 4, i32 4, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 4, i32 4, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 4, i32 4, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v1( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 4, i32 4, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 4, i32 4, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 4, i32 4, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> 
@llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 4, i32 4, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v1" +; CHECK-VISAASM-DAG: dpas.s8.s8.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.s8.s8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s8.s8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s8.s8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d 
num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call0 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 32, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 51) store i32 %call0, i32* %res1I32 @@ -72,15 +99,41 @@ define spir_kernel void @test_v2(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v2( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 4, i32 1, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 4, i32 1, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 4, i32 1, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 4, i32 1, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v2( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 4, i32 1, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 
x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 4, i32 1, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 4, i32 1, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 4, i32 1, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v2" +; CHECK-VISAASM-DAG: dpas.u8.s8.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.u8.s8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u8.s8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl 
[[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u8.s8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call4 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 32, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 49) store i32 %call4, i32* %res1I32 @@ -99,15 +152,41 @@ define spir_kernel void @test_v3(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v3( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 1, i32 4, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 1, i32 4, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 1, i32 4, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 
1, i32 4, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v3( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 1, i32 4, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 1, i32 4, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 1, i32 4, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 1, i32 4, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v3" +; CHECK-VISAASM-DAG: dpas.s8.u8.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.s8.u8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; 
CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s8.u8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s8.u8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call8 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 32, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 50) store i32 %call8, i32* %res1I32 @@ -126,15 +205,41 @@ define spir_kernel void @test_v4(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v4( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 1, i32 1, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 1, i32 1, i32 8, i32 2, i1 
false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 1, i32 1, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 1, i32 1, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v4( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 1, i32 1, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 1, i32 1, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 1, i32 1, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 1, i32 1, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v4" +; CHECK-VISAASM-DAG: dpas.u8.u8.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] 
v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.u8.u8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u8.u8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u8.u8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call12 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 32, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 48) store i32 %call12, i32* %res1I32 @@ -154,15 +259,41 @@ define spir_kernel void @test_v5(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> 
%c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v5( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 5, i32 5, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 5, i32 5, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 5, i32 5, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 5, i32 5, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v5( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 5, i32 5, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 5, i32 5, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 5, i32 5, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 5, i32 5, i32 8, i32 8, i1 false) 
+; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v5" +; CHECK-VISAASM-DAG: dpas.s4.s4.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.s4.s4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s4.s4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s4.s4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 
align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call16 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 64, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 195) store i32 %call16, i32* %res1I32 @@ -181,15 +312,41 @@ define spir_kernel void @test_v6(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v6( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 5, i32 2, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 5, i32 2, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 5, i32 2, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 5, i32 2, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v6( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 5, i32 2, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 5, i32 2, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* 
%res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 5, i32 2, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 5, i32 2, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v6" +; CHECK-VISAASM-DAG: dpas.u4.s4.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.u4.s4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u4.s4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 
alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u4.s4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call20 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 64, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 193) store i32 %call20, i32* %res1I32 @@ -208,15 +365,41 @@ define spir_kernel void @test_v7(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v7( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 2, i32 5, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 2, i32 5, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 2, i32 5, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 2, i32 5, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; 
CHECK-GENISA-LABEL: @test_v7( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 2, i32 5, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 2, i32 5, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 2, i32 5, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 2, i32 5, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v7" +; CHECK-VISAASM-DAG: dpas.s4.u4.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.s4.u4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 
alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s4.u4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s4.u4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call24 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 64, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 194) store i32 %call24, i32* %res1I32 @@ -230,20 +413,47 @@ entry: ret void } +; int4 matrix sources, int32 accumulator: define spir_kernel void @test_v8(i32* %res1I32, <2 x i32>* %res2I32, <4 x i32>* %res4I32, <8 x i32>* %res8I32, i16 %a1, <2 x i16> %a2, <4 x i16> %a4, <8 x i16> %a8, <8 x i32> %b, i32 %c1I32, <2 x i32> %c2I32, <4 x i32> %c4I32, <8 x i32> %c8I32) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v8( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 2, i32 2, i32 8, i32 1, i1 false) -; CHECK: store i32 [[DPAS]], i32* %res1I32 -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x
i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 2, i32 2, i32 8, i32 2, i1 false) -; CHECK: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 2, i32 2, i32 8, i32 4, i1 false) -; CHECK: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 2, i32 2, i32 8, i32 8, i1 false) -; CHECK: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 +; CHECK-GENISA-LABEL: @test_v8( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v8i32(i32 %c1I32, i16 %a1, <8 x i32> %b, i32 2, i32 2, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i32 [[DPAS]], i32* %res1I32 +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v8i32(<2 x i32> %c2I32, <2 x i16> %a2, <8 x i32> %b, i32 2, i32 2, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS1]], <2 x i32>* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v8i32(<4 x i32> %c4I32, <4 x i16> %a4, <8 x i32> %b, i32 2, i32 2, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS2]], <4 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8i32.v8i32.v8i16.v8i32(<8 x i32> %c8I32, <8 x i16> %a8, <8 x i32> %b, i32 2, i32 2, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i32> [[DPAS3]], <8 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v8" +; CHECK-VISAASM-DAG: dpas.u4.u4.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=d num_elts=16 +; CHECK-VISAASM-DAG: .decl 
[[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.u4.u4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u4.u4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u4.u4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call28 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iii(i32 64, i16 %a1, <8 x i32> %b, i32 %c1I32, i32 192) store i32 %call28, i32* %res1I32 @@ -263,15 +473,41 @@ define spir_kernel void @test_v9(float* %resF, <2 x float>* %res2, <4 x float>* 
<8 x i32> %b, float %cF, <2 x float> %c2F, <4 x float> %c4F, <8 x float> %c8F) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v9( -; CHECK: [[DPAS:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.i16.v8i32(float %cF, i16 %a1, <8 x i32> %b, i32 12, i32 12, i32 8, i32 1, i1 false) -; CHECK: store float [[DPAS]], float* %resF -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.v2i16.v8i32(<2 x float> %c2F, <2 x i16> %a2, <8 x i32> %b, i32 12, i32 12, i32 8, i32 2, i1 false) -; CHECK: store <2 x float> [[DPAS1]], <2 x float>* %res2 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v8i32(<4 x float> %c4F, <4 x i16> %a4, <8 x i32> %b, i32 12, i32 12, i32 8, i32 4, i1 false) -; CHECK: store <4 x float> [[DPAS2]], <4 x float>* %res4 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %c8F, <8 x i16> %a8, <8 x i32> %b, i32 12, i32 12, i32 8, i32 8, i1 false) -; CHECK: store <8 x float> [[DPAS3]], <8 x float>* %res8 +; CHECK-GENISA-LABEL: @test_v9( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.i16.v8i32(float %cF, i16 %a1, <8 x i32> %b, i32 12, i32 12, i32 8, i32 1, i1 false) +; CHECK-GENISA: store float [[DPAS]], float* %resF +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.v2i16.v8i32(<2 x float> %c2F, <2 x i16> %a2, <8 x i32> %b, i32 12, i32 12, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x float> [[DPAS1]], <2 x float>* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v8i32(<4 x float> %c4F, <4 x i16> %a4, <8 x i32> %b, i32 12, i32 12, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x float> [[DPAS2]], <4 x float>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x float> 
@llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %c8F, <8 x i16> %a8, <8 x i32> %b, i32 12, i32 12, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x float> [[DPAS3]], <8 x float>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v9" +; CHECK-VISAASM-DAG: dpas.hf.hf.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=f 
num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call32 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_ifi(i32 16, i16 %a1, <8 x i32> %b, float %cF, i32 3072) store float %call32, float* %resF @@ -291,15 +527,41 @@ define spir_kernel void @test_v10(float* %resF, <2 x float>* %res2, <4 x float> <8 x i32> %b, float %cF, <2 x float> %c2F, <4 x float> %c4F, <8 x float> %c8F) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v10( -; CHECK: [[DPAS:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.i16.v8i32(float %cF, i16 %a1, <8 x i32> %b, i32 11, i32 11, i32 8, i32 1, i1 false) -; CHECK: store float [[DPAS]], float* %resF -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.v2i16.v8i32(<2 x float> %c2F, <2 x i16> %a2, <8 x i32> %b, i32 11, i32 11, i32 8, i32 2, i1 false) -; CHECK: store <2 x float> [[DPAS1]], <2 x float>* %res2 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v8i32(<4 x float> %c4F, <4 x i16> %a4, <8 x i32> %b, i32 11, i32 11, i32 8, i32 4, i1 false) -; CHECK: store <4 x float> [[DPAS2]], <4 x float>* %res4 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %c8F, <8 x i16> %a8, <8 x i32> %b, i32 11, i32 11, i32 8, i32 8, i1 false) -; CHECK: store <8 x float> [[DPAS3]], <8 x float>* %res8 +; CHECK-GENISA-LABEL: @test_v10( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.i16.v8i32(float %cF, i16 %a1, <8 x i32> %b, i32 11, i32 11, i32 8, i32 1, i1 false) +; CHECK-GENISA: store float [[DPAS]], float* %resF +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x float> 
@llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.v2i16.v8i32(<2 x float> %c2F, <2 x i16> %a2, <8 x i32> %b, i32 11, i32 11, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x float> [[DPAS1]], <2 x float>* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v8i32(<4 x float> %c4F, <4 x i16> %a4, <8 x i32> %b, i32 11, i32 11, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x float> [[DPAS2]], <4 x float>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %c8F, <8 x i16> %a8, <8 x i32> %b, i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x float> [[DPAS3]], <8 x float>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v10" +; CHECK-VISAASM-DAG: dpas.bf.bf.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=f num_elts=64 +; 
CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call36 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_ifi(i32 16, i16 %a1, <8 x i32> %b, float %cF, i32 12288) store float %call36, float* %resF @@ -319,15 +581,41 @@ define spir_kernel void @test_v11(half* %res, <2 x half>* %res2, <4 x half>* %r <8 x i32> %b, half %c, <2 x half> %c2, <4 x half> %c4, <8 x half> %c8) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v11( -; CHECK: [[DPAS:%[A-z0-9]*]] = call half @llvm.genx.GenISA.sub.group.dpas.f16.f16.i16.v8i32(half %c, i16 %a1, <8 x i32> %b, i32 12, i32 12, i32 8, i32 1, i1 false) -; CHECK: store half [[DPAS]], half* %res -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x half> @llvm.genx.GenISA.sub.group.dpas.v2f16.v2f16.v2i16.v8i32(<2 x half> %c2, <2 x i16> %a2, <8 x i32> %b, i32 12, i32 12, i32 8, i32 2, i1 false) -; CHECK: store <2 x half> [[DPAS1]], <2 x half>* %res2 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x half> @llvm.genx.GenISA.sub.group.dpas.v4f16.v4f16.v4i16.v8i32(<4 x half> %c4, <4 x i16> %a4, <8 x i32> %b, i32 12, i32 12, i32 8, i32 4, i1 false) -; CHECK: store <4 x half> [[DPAS2]], <4 x half>* %res4 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x half> 
@llvm.genx.GenISA.sub.group.dpas.v8f16.v8f16.v8i16.v8i32(<8 x half> %c8, <8 x i16> %a8, <8 x i32> %b, i32 12, i32 12, i32 8, i32 8, i1 false) -; CHECK: store <8 x half> [[DPAS3]], <8 x half>* %res8 +; CHECK-GENISA-LABEL: @test_v11( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call half @llvm.genx.GenISA.sub.group.dpas.f16.f16.i16.v8i32(half %c, i16 %a1, <8 x i32> %b, i32 12, i32 12, i32 8, i32 1, i1 false) +; CHECK-GENISA: store half [[DPAS]], half* %res +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x half> @llvm.genx.GenISA.sub.group.dpas.v2f16.v2f16.v2i16.v8i32(<2 x half> %c2, <2 x i16> %a2, <8 x i32> %b, i32 12, i32 12, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x half> [[DPAS1]], <2 x half>* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x half> @llvm.genx.GenISA.sub.group.dpas.v4f16.v4f16.v4i16.v8i32(<4 x half> %c4, <4 x i16> %a4, <8 x i32> %b, i32 12, i32 12, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x half> [[DPAS2]], <4 x half>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x half> @llvm.genx.GenISA.sub.group.dpas.v8f16.v8f16.v8i16.v8i32(<8 x half> %c8, <8 x i16> %a8, <8 x i32> %b, i32 12, i32 12, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x half> [[DPAS3]], <8 x half>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v11" +; CHECK-VISAASM-DAG: dpas.hf.hf.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=hf num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=hf num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=hf num_elts=32 +; CHECK-VISAASM-DAG: 
.decl [[C2]] v_type=G type=hf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=hf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=hf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=hf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=hf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call32 = call spir_func half @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_iDhi(i32 16, i16 %a1, <8 x i32> %b, half %c, i32 3072) store half %call32, half* %res @@ -347,15 +635,41 @@ define spir_kernel void @test_v12(i16* %res, <2 x i16>* %res2, <4 x i16>* %res4, <8 x i32> %b, i16 %cF, <2 x i16> %c2F, <4 x i16> %c4F, <8 x i16> %c8F) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v12( -; CHECK: [[DPAS:%[A-z0-9]*]] = call i16 @llvm.genx.GenISA.sub.group.dpas.i16.i16.i16.v8i32(i16 %cF, i16 %a1, <8 x i32> %b, i32 11, i32 11, i32 8, i32 1, i1 false) -; CHECK: store i16 [[DPAS]], i16* %res -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x i16> 
@llvm.genx.GenISA.sub.group.dpas.v2i16.v2i16.v2i16.v8i32(<2 x i16> %c2F, <2 x i16> %a2, <8 x i32> %b, i32 11, i32 11, i32 8, i32 2, i1 false) -; CHECK: store <2 x i16> [[DPAS1]], <2 x i16>* %res2 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x i16> @llvm.genx.GenISA.sub.group.dpas.v4i16.v4i16.v4i16.v8i32(<4 x i16> %c4F, <4 x i16> %a4, <8 x i32> %b, i32 11, i32 11, i32 8, i32 4, i1 false) -; CHECK: store <4 x i16> [[DPAS2]], <4 x i16>* %res4 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x i16> @llvm.genx.GenISA.sub.group.dpas.v8i16.v8i16.v8i16.v8i32(<8 x i16> %c8F, <8 x i16> %a8, <8 x i32> %b, i32 11, i32 11, i32 8, i32 8, i1 false) -; CHECK: store <8 x i16> [[DPAS3]], <8 x i16>* %res8 +; CHECK-GENISA-LABEL: @test_v12( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call i16 @llvm.genx.GenISA.sub.group.dpas.i16.i16.i16.v8i32(i16 %cF, i16 %a1, <8 x i32> %b, i32 11, i32 11, i32 8, i32 1, i1 false) +; CHECK-GENISA: store i16 [[DPAS]], i16* %res +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x i16> @llvm.genx.GenISA.sub.group.dpas.v2i16.v2i16.v2i16.v8i32(<2 x i16> %c2F, <2 x i16> %a2, <8 x i32> %b, i32 11, i32 11, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x i16> [[DPAS1]], <2 x i16>* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x i16> @llvm.genx.GenISA.sub.group.dpas.v4i16.v4i16.v4i16.v8i32(<4 x i16> %c4F, <4 x i16> %a4, <8 x i32> %b, i32 11, i32 11, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x i16> [[DPAS2]], <4 x i16>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x i16> @llvm.genx.GenISA.sub.group.dpas.v8i16.v8i16.v8i16.v8i32(<8 x i16> %c8F, <8 x i16> %a8, <8 x i32> %b, i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x i16> [[DPAS3]], <8 x i16>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v12" +; CHECK-VISAASM-DAG: dpas.bf.bf.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=bf num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=bf 
num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=8 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=w num_elts=16 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=bf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=bf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=bf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=bf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=bf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=bf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 %call36 = call spir_func i16 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv8_isi(i32 16, i16 %a1, <8 x i32> %b, i16 %cF, i32 12300) store i16 %call36, i16* %res @@ -375,15 +689,41 @@ define spir_kernel void 
@test_v13(float* %resF, <2 x float>* %res2, <4 x float> <8 x float> %b, float %cF, <2 x float> %c2F, <4 x float> %c4F, <8 x float> %c8F) !intel_reqd_sub_group_size !100 { entry: -; CHECK-LABEL: @test_v13( -; CHECK: [[DPAS:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.f32.v8i32(float %cF, float %a1, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 1, i1 false) -; CHECK: store float [[DPAS]], float* %resF -; CHECK: [[DPAS1:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.f32.v8i32(<2 x float> %c2F, float %a2, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 2, i1 false) -; CHECK: store <2 x float> [[DPAS1]], <2 x float>* %res2 -; CHECK: [[DPAS2:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v2f32.v8i32(<4 x float> %c4F, <2 x float> %a4, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 4, i1 false) -; CHECK: store <4 x float> [[DPAS2]], <4 x float>* %res4 -; CHECK: [[DPAS3:%[A-z0-9]*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v4f32.v8i32(<8 x float> %c8F, <4 x float> %a8, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 8, i1 false) -; CHECK: store <8 x float> [[DPAS3]], <8 x float>* %res8 +; CHECK-GENISA-LABEL: @test_v13( +; CHECK-GENISA: [[DPAS:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.f32.v8i32(float %cF, float %a1, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 1, i1 false) +; CHECK-GENISA: store float [[DPAS]], float* %resF +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.f32.v8i32(<2 x float> %c2F, float %a2, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 2, i1 false) +; CHECK-GENISA: store <2 x float> [[DPAS1]], <2 x float>* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v2f32.v8i32(<4 x float> %c4F, <2 x float> %a4, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <4 x float> [[DPAS2]], <4 x float>* %res4 
+; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v4f32.v8i32(<8 x float> %c8F, <4 x float> %a8, <8 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <8 x float> [[DPAS3]], <8 x float>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v13" +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.1 (M1, 16) [[D1:[A-z0-9_]*]].0 [[C1:[A-z0-9_]*]].0 [[B1:[A-z0-9_]*]].0 [[A1:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D1]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: .decl [[C1]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: .decl [[B1]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A1]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A1_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A1_ALIAS]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=f num_elts=16 +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl 
[[D8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=f num_elts=64 %call32 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELifDv8_ffi(i32 8, float %a1, <8 x float> %b, float %cF, i32 768) store float %call32, float* %resF diff --git a/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc_simd32.ll b/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc_simd32.ll new file mode 100644 index 000000000..1386f57d5 --- /dev/null +++ b/IGC/ocloc_tests/SPIRV-extenstions/SPV_INTEL_subgroup_matrix_multiply_accumulate/dpas_pvc_simd32.ll @@ -0,0 +1,610 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2025 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= + +; REQUIRES: llvm-spirv, pvc-supported + +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_subgroup_matrix_multiply_accumulate -o %t.spv +; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'PrintToConsole=1 PrintAfter=ArithmeticFuncsTranslation'" 2>&1 | FileCheck %s --check-prefix=CHECK-GENISA +; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'DumpVISAASMToConsole=1'" 2>&1 | FileCheck %s --check-prefix=CHECK-VISAASM + +; Tests DPAS called from subgroup-size=32 kernels. +; Supported dimensions M, N, K are exactly the same and don't change with subgroup-size. +; To correctly map 32 work-items, each work item contains half of the data compared to subgroup-size=16. 
+; +; This test is a copy of the SIMD16 test (dpas_pvc.ll) with modified GenISA checks. vISA ASM checks are exactly the same. + +target triple = "spir64-unknown-unknown" + +declare spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32, i16 signext, <4 x i32>, i32, i32) +declare spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32, <2 x i16>, <4 x i32>, <2 x i32>, i32) +declare spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32, <4 x i16>, <4 x i32>, <4 x i32>, i32) + +declare spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_ifi(i32, i16 signext, <4 x i32>, float, i32) +declare spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_fi(i32, <2 x i16>, <4 x i32>, <2 x float>, i32) +declare spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iDv4_fi(i32, <4 x i16>, <4 x i32>, <4 x float>, i32) + +declare spir_func half @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iDhi(i32, i16 signext, <4 x i32>, half, i32) +declare spir_func <2 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_Dhi(i32, <2 x i16>, <4 x i32>, <2 x half>, i32) +declare spir_func <4 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iDv4_Dhi(i32, <4 x i16>, <4 x i32>, <4 x half>, i32) + +declare spir_func signext i16 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_isi(i32, i16 signext, <4 x i32>, i16 signext, i32) +declare spir_func <2 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iS_i(i32, <2 x i16>, <4 x i32>, <2 x i16>, i32) +declare spir_func <4 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS_i(i32, <4 x i16>, <4 x i32>, <4 x i16>, i32) + +declare spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELifDv4_ffi(i32, float, <4 x float>, float, i32) +declare spir_func <2 x float>
@_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_fDv4_fS_i(i32, float, <4 x float>, <2 x float>, i32) +declare spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_fS_S_i(i32, <2 x float>, <4 x float>, <4 x float>, i32) + +; 8-bit integer matrix sources (signed and unsigned), 32-bit integer accumulator: +define spir_kernel void @test_v1(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v1( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 4, i32 4, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 4, i32 4, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 4, i32 4, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v1" +; CHECK-VISAASM-DAG: dpas.s8.s8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s8.s8.8.4 (M1, 16) 
[[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s8.s8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call1 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 32, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 51) + store i32 %call1, i32* %res2I32 + %call2 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 32, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 51) + store <2 x i32> %call2, <2 x i32>* %res4I32 + %call3 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 32, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 51) + store <4 x i32> %call3, <4 x i32>* %res8I32 + + ret void +} + +define spir_kernel void @test_v2(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v2( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, 
i16 %a2, <4 x i32> %b, i32 4, i32 1, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 4, i32 1, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 4, i32 1, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v2" +; CHECK-VISAASM-DAG: dpas.u8.s8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u8.s8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u8.s8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d 
num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call5 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 32, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 49) + store i32 %call5, i32* %res2I32 + %call6 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 32, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 49) + store <2 x i32> %call6, <2 x i32>* %res4I32 + %call7 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 32, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 49) + store <4 x i32> %call7, <4 x i32>* %res8I32 + + ret void +} + +define spir_kernel void @test_v3(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v3( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 1, i32 4, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 1, i32 4, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 1, i32 4, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 +; CHECK-VISAASM-LABEL: .kernel "test_v3" +; CHECK-VISAASM-DAG: dpas.s8.u8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0
[[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s8.u8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s8.u8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call9 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 32, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 50) + store i32 %call9, i32* %res2I32 + %call10 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 32, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 50) + store <2 x i32> %call10, <2 x i32>* %res4I32 + %call11 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 32, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 50) + store <4 x i32> %call11, <4 x i32>* 
%res8I32 + + ret void +} + +define spir_kernel void @test_v4(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v4( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 1, i32 1, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 1, i32 1, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 1, i32 1, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v4" +; CHECK-VISAASM-DAG: dpas.u8.u8.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u8.u8.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud 
num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u8.u8.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call13 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 32, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 48) + store i32 %call13, i32* %res2I32 + %call14 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 32, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 48) + store <2 x i32> %call14, <2 x i32>* %res4I32 + %call15 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 32, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 48) + store <4 x i32> %call15, <4 x i32>* %res8I32 + + ret void +} + +; 4-bit integer matrix sources (signed and unsigned), 32-bit integer accumulator: +define spir_kernel void @test_v5(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v5( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 5, i32 5, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> 
%c4I32, <2 x i16> %a4, <4 x i32> %b, i32 5, i32 5, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 5, i32 5, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v5" +; CHECK-VISAASM-DAG: dpas.s4.s4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s4.s4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s4.s4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call17 = call spir_func i32 
@_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 64, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 195) + store i32 %call17, i32* %res2I32 + %call18 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 64, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 195) + store <2 x i32> %call18, <2 x i32>* %res4I32 + %call19 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 64, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 195) + store <4 x i32> %call19, <4 x i32>* %res8I32 + + ret void +} + +define spir_kernel void @test_v6(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v6( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 5, i32 2, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 5, i32 2, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 5, i32 2, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v6" +; CHECK-VISAASM-DAG: dpas.u4.s4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G 
type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u4.s4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u4.s4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call21 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 64, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 193) + store i32 %call21, i32* %res2I32 + %call22 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 64, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 193) + store <2 x i32> %call22, <2 x i32>* %res4I32 + %call23 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 64, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 193) + store <4 x i32> %call23, <4 x i32>* %res8I32 + + ret void +} + +define spir_kernel void @test_v7(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x 
i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v7( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 2, i32 5, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 2, i32 5, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32> @llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 2, i32 5, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v7" +; CHECK-VISAASM-DAG: dpas.s4.u4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.s4.u4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.s4.u4.8.8 (M1, 16) 
[[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call25 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 64, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 194) + store i32 %call25, i32* %res2I32 + %call26 = call spir_func <2 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 64, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 194) + store <2 x i32> %call26, <2 x i32>* %res4I32 + %call27 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 64, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 194) + store <4 x i32> %call27, <4 x i32>* %res8I32 + + ret void +} + +; 4-bit unsigned integer matrix sources, 32-bit integer accumulator: +define spir_kernel void @test_v8(i32* %res1I32, i32* %res2I32, <2 x i32>* %res4I32, <4 x i32>* %res8I32, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i32 %c1I32, i32 %c2I32, <2 x i32> %c4I32, <4 x i32> %c8I32) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v8( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i32 @llvm.genx.GenISA.sub.group.dpas.i32.i32.i16.v4i32(i32 %c2I32, i16 %a2, <4 x i32> %b, i32 2, i32 2, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i32 [[DPAS1]], i32* %res2I32 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i32> @llvm.genx.GenISA.sub.group.dpas.v2i32.v2i32.v2i16.v4i32(<2 x i32> %c4I32, <2 x i16> %a4, <4 x i32> %b, i32 2, i32 2, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i32> [[DPAS2]], <2 x i32>* %res4I32 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i32>
@llvm.genx.GenISA.sub.group.dpas.v4i32.v4i32.v4i16.v4i32(<4 x i32> %c8I32, <4 x i16> %a8, <4 x i32> %b, i32 2, i32 2, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i32> [[DPAS3]], <4 x i32>* %res8I32 + +; CHECK-VISAASM-LABEL: .kernel "test_v8" +; CHECK-VISAASM-DAG: dpas.u4.u4.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=d num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.u4.u4.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=d num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.u4.u4.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call29 = call spir_func i32 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iii(i32 64, i16 %a2, <4 x i32> %b, i32 %c2I32, i32 192) + store i32 %call29, i32* %res2I32 + %call30 = call spir_func <2 x i32> 
@_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_ii(i32 64, <2 x i16> %a4, <4 x i32> %b, <2 x i32> %c4I32, i32 192) + store <2 x i32> %call30, <2 x i32>* %res4I32 + %call31 = call spir_func <4 x i32> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS0_i(i32 64, <4 x i16> %a8, <4 x i32> %b, <4 x i32> %c8I32, i32 192) + store <4 x i32> %call31, <4 x i32>* %res8I32 + + ret void +} + +; fp16 matrix sources, fp32 accumulator: +define spir_kernel void @test_v9(float* %resF, float* %res2, <2 x float>* %res4, <4 x float>* %res8, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + float %cF, float %c2F, <2 x float> %c4F, <4 x float> %c8F) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v9( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.i16.v4i32(float %c2F, i16 %a2, <4 x i32> %b, i32 12, i32 12, i32 8, i32 2, i1 false) +; CHECK-GENISA: store float [[DPAS1]], float* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.v2i16.v4i32(<2 x float> %c4F, <2 x i16> %a4, <4 x i32> %b, i32 12, i32 12, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x float> [[DPAS2]], <2 x float>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> %c8F, <4 x i16> %a8, <4 x i32> %b, i32 12, i32 12, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x float> [[DPAS3]], <4 x float>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v9" +; CHECK-VISAASM-DAG: dpas.hf.hf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; 
CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call33 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_ifi(i32 16, i16 %a2, <4 x i32> %b, float %c2F, i32 3072) + store float %call33, float* %res2 + %call34 = call spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_fi(i32 16, <2 x i16> %a4, <4 x i32> %b, <2 x float> %c4F, i32 3072) + store <2 x float> %call34, <2 x float>* %res4 + %call35 = call spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iDv4_fi(i32 16, <4 x i16> %a8, <4 x i32> %b, <4 x float> %c8F, i32 3072) + store <4 x float> %call35, <4 x float>* %res8 + + ret void +} + +; bf16 matrix sources, fp32 accumulator: +define spir_kernel void @test_v10(float* %resF, float* %res2, <2 x float>* %res4, <4 x float>* %res8, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + float %cF, float %c2F, <2 x float> %c4F, <4 x float> %c8F) 
!intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v10( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.i16.v4i32(float %c2F, i16 %a2, <4 x i32> %b, i32 11, i32 11, i32 8, i32 2, i1 false) +; CHECK-GENISA: store float [[DPAS1]], float* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.v2i16.v4i32(<2 x float> %c4F, <2 x i16> %a4, <4 x i32> %b, i32 11, i32 11, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x float> [[DPAS2]], <2 x float>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> %c8F, <4 x i16> %a8, <4 x i32> %b, i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x float> [[DPAS3]], <4 x float>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v10" +; CHECK-VISAASM-DAG: dpas.bf.bf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 
[[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call37 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_ifi(i32 16, i16 %a2, <4 x i32> %b, float %c2F, i32 12288) + store float %call37, float* %res2 + %call38 = call spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_fi(i32 16, <2 x i16> %a4, <4 x i32> %b, <2 x float> %c4F, i32 12288) + store <2 x float> %call38, <2 x float>* %res4 + %call39 = call spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iDv4_fi(i32 16, <4 x i16> %a8, <4 x i32> %b, <4 x float> %c8F, i32 12288) + store <4 x float> %call39, <4 x float>* %res8 + + ret void +} + +; fp16 matrix sources, fp16 accumulator: +define spir_kernel void @test_v11(half* %res, half* %res2, <2 x half>* %res4, <4 x half>* %res8, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + half %c, half %c2, <2 x half> %c4, <4 x half> %c8) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v11( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call half @llvm.genx.GenISA.sub.group.dpas.f16.f16.i16.v4i32(half %c2, i16 %a2, <4 x i32> %b, i32 12, i32 12, i32 8, i32 2, i1 false) +; CHECK-GENISA: store half [[DPAS1]], half* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x half> @llvm.genx.GenISA.sub.group.dpas.v2f16.v2f16.v2i16.v4i32(<2 x half> %c4, <2 x i16> %a4, <4 x i32> %b, i32 12, i32 12, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x half> [[DPAS2]], <2 x half>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x half> @llvm.genx.GenISA.sub.group.dpas.v4f16.v4f16.v4i16.v4i32(<4 x half> %c8, <4 x 
i16> %a8, <4 x i32> %b, i32 12, i32 12, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x half> [[DPAS3]], <4 x half>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v11" +; CHECK-VISAASM-DAG: dpas.hf.hf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=hf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=hf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=hf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=hf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.hf.hf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=hf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=hf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call33 = call spir_func half @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_iDhi(i32 16, i16 %a2, <4 x i32> %b, half %c2, i32 3072) + store half %call33, half* %res2 + %call34 = call spir_func <2 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iDv2_Dhi(i32 16, <2 x i16> %a4, <4 x 
i32> %b, <2 x half> %c4, i32 3072) + store <2 x half> %call34, <2 x half>* %res4 + %call35 = call spir_func <4 x half> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iDv4_Dhi(i32 16, <4 x i16> %a8, <4 x i32> %b, <4 x half> %c8, i32 3072) + store <4 x half> %call35, <4 x half>* %res8 + + ret void +} + +; bf16 matrix sources, bf16 accumulator: +define spir_kernel void @test_v12(i16* %res, i16* %res2, <2 x i16>* %res4, <4 x i16>* %res8, + i16 %a1, i16 %a2, <2 x i16> %a4, <4 x i16> %a8, + <4 x i32> %b, + i16 %cF, i16 %c2F, <2 x i16> %c4F, <4 x i16> %c8F) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v12( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call i16 @llvm.genx.GenISA.sub.group.dpas.i16.i16.i16.v4i32(i16 %c2F, i16 %a2, <4 x i32> %b, i32 11, i32 11, i32 8, i32 2, i1 false) +; CHECK-GENISA: store i16 [[DPAS1]], i16* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x i16> @llvm.genx.GenISA.sub.group.dpas.v2i16.v2i16.v2i16.v4i32(<2 x i16> %c4F, <2 x i16> %a4, <4 x i32> %b, i32 11, i32 11, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x i16> [[DPAS2]], <2 x i16>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x i16> @llvm.genx.GenISA.sub.group.dpas.v4i16.v4i16.v4i16.v4i32(<4 x i16> %c8F, <4 x i16> %a8, <4 x i32> %b, i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x i16> [[DPAS3]], <4 x i16>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v12" +; CHECK-VISAASM-DAG: dpas.bf.bf.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=bf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=bf num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=16 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=w num_elts=32 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 
[[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=bf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=bf num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=w num_elts=64 +; CHECK-VISAASM-DAG: dpas.bf.bf.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=bf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[C8]] v_type=G type=bf num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=w num_elts=128 + + %call37 = call spir_func i16 @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELisDv4_isi(i32 16, i16 %a2, <4 x i32> %b, i16 %c2F, i32 12300) + store i16 %call37, i16* %res2 + %call38 = call spir_func <2 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_sDv4_iS_i(i32 16, <2 x i16> %a4, <4 x i32> %b, <2 x i16> %c4F, i32 12300) + store <2 x i16> %call38, <2 x i16>* %res4 + %call39 = call spir_func <4 x i16> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_sDv4_iS_i(i32 16, <4 x i16> %a8, <4 x i32> %b, <4 x i16> %c8F, i32 12300) + store <4 x i16> %call39, <4 x i16>* %res8 + + ret void +} + +; tf32 matrix sources, fp32 accumulator: +define spir_kernel void @test_v13(float* %resF, float* %res2, <2 x float>* %res4, <4 x float>* %res8, + float %a1, float %a2, float %a4, <2 x float> %a8, + <4 x float> %b, + float %cF, float %c2F, <2 x float> %c4F, <4 x float> %c8F) !intel_reqd_sub_group_size !100 { +entry: +; CHECK-GENISA-LABEL: @test_v13( +; CHECK-GENISA: [[DPAS1:%[A-z0-9]*]] = call float 
@llvm.genx.GenISA.sub.group.dpas.f32.f32.f32.v4i32(float %c2F, float %a2, <4 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 2, i1 false) +; CHECK-GENISA: store float [[DPAS1]], float* %res2 +; CHECK-GENISA: [[DPAS2:%[A-z0-9]*]] = call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.f32.v4i32(<2 x float> %c4F, float %a4, <4 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 4, i1 false) +; CHECK-GENISA: store <2 x float> [[DPAS2]], <2 x float>* %res4 +; CHECK-GENISA: [[DPAS3:%[A-z0-9]*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v2f32.v4i32(<4 x float> %c8F, <2 x float> %a8, <4 x i32> %{{.*}}, i32 10, i32 10, i32 8, i32 8, i1 false) +; CHECK-GENISA: store <4 x float> [[DPAS3]], <4 x float>* %res8 + +; CHECK-VISAASM-LABEL: .kernel "test_v13" +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.2 (M1, 16) [[D2:[A-z0-9_]*]].0 [[C2:[A-z0-9_]*]].0 [[B2:[A-z0-9_]*]].0 [[A2:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[C2]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: .decl [[B2]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A2]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A2_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A2_ALIAS]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.4 (M1, 16) [[D4:[A-z0-9_]*]].0 [[C4:[A-z0-9_]*]].0 [[B4:[A-z0-9_]*]].0 [[A4:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[C4]] v_type=G type=f num_elts=64 +; CHECK-VISAASM-DAG: .decl [[B4]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A4]] v_type=G type=ud num_elts=32 align=wordx32 alias=<[[A4_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A4_ALIAS]] v_type=G type=f num_elts=32 +; CHECK-VISAASM-DAG: dpas.tf32.tf32.8.8 (M1, 16) [[D8:[A-z0-9_]*]].0 [[C8:[A-z0-9_]*]].0 [[B8:[A-z0-9_]*]].0 [[A8:[A-z0-9_]*]](0,0) +; CHECK-VISAASM-DAG: .decl [[D8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: 
.decl [[C8]] v_type=G type=f num_elts=128 +; CHECK-VISAASM-DAG: .decl [[B8]] v_type=G type=d num_elts=128 +; CHECK-VISAASM-DAG: .decl [[A8]] v_type=G type=ud num_elts=64 align=wordx32 alias=<[[A8_ALIAS:[A-z0-9_]*]], 0> +; CHECK-VISAASM-DAG: .decl [[A8_ALIAS]] v_type=G type=f num_elts=64 + + %call33 = call spir_func float @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELifDv4_ffi(i32 8, float %a2, <4 x float> %b, float %c2F, i32 768) + store float %call33, float* %res2 + %call34 = call spir_func <2 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv2_fDv4_fS_i(i32 8, float %a4, <4 x float> %b, <2 x float> %c4F, i32 768) + store <2 x float> %call34, <2 x float>* %res4 + %call35 = call spir_func <4 x float> @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv4_fS_S_i(i32 8, <2 x float> %a8, <4 x float> %b, <4 x float> %c8F, i32 768) + store <4 x float> %call35, <4 x float>* %res8 + + ret void +} + +!100 = !{i32 32}