mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-21 09:14:47 +08:00
fix: Unify logic calculating threads per work group part 3
Related-To: NEO-8087
Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by: Compute-Runtime-Automation
parent: d2f1cf98d7
commit: 61f701aba5
@@ -345,7 +345,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
|||||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||||
simdSize, grfSize, numChannels, itemsInGroup));
|
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper));
|
||||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||||
alignedFree(perThreadDataForWholeThreadGroup);
|
alignedFree(perThreadDataForWholeThreadGroup);
|
||||||
|
|||||||
@@ -991,7 +991,8 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
|
|||||||
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
|
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
|
||||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||||
|
|
||||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor);
|
const auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||||
|
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||||
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
|
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
|
||||||
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize);
|
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize);
|
||||||
@@ -1027,7 +1028,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
|||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
||||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
|
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
@@ -1073,7 +1074,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
|||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
||||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
|
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
|||||||
@@ -2062,7 +2062,8 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
|
|||||||
template <typename FamilyType>
|
template <typename FamilyType>
|
||||||
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {
|
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {
|
||||||
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
||||||
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor());
|
const auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||||
|
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||||
return implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
return implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
||||||
} else {
|
} else {
|
||||||
return 0u;
|
return 0u;
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
|||||||
const Kernel &kernel);
|
const Kernel &kernel);
|
||||||
static size_t getSizeRequiredIOH(
|
static size_t getSizeRequiredIOH(
|
||||||
const Kernel &kernel,
|
const Kernel &kernel,
|
||||||
size_t localWorkSize = 256);
|
const size_t localWorkSizes[3]);
|
||||||
static size_t getSizeRequiredSSH(
|
static size_t getSizeRequiredSSH(
|
||||||
const Kernel &kernel);
|
const Kernel &kernel);
|
||||||
|
|
||||||
|
|||||||
@@ -45,20 +45,33 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(const Kernel &kerne
|
|||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
|
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
|
||||||
size_t localWorkSize) {
|
const size_t localWorkSizes[3]) {
|
||||||
|
auto localWorkSize = Math::computeTotalElementsCount(localWorkSizes);
|
||||||
typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
|
typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
|
||||||
const auto &kernelDescriptor = kernel.getDescriptor();
|
const auto &kernelDescriptor = kernel.getDescriptor();
|
||||||
const auto &hwInfo = kernel.getHardwareInfo();
|
const auto &hwInfo = kernel.getHardwareInfo();
|
||||||
|
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
||||||
|
|
||||||
auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||||
uint32_t grfSize = hwInfo.capabilityTable.grfSize;
|
uint32_t grfSize = hwInfo.capabilityTable.grfSize;
|
||||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||||
|
uint32_t requiredWalkOrder = 0u;
|
||||||
|
auto isHwLocalIdGeneration = !NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
|
||||||
|
numChannels,
|
||||||
|
localWorkSizes,
|
||||||
|
std::array<uint8_t, 3>{
|
||||||
|
{kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
|
||||||
|
kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
|
||||||
|
kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
|
||||||
|
kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
|
||||||
|
requiredWalkOrder,
|
||||||
|
simdSize);
|
||||||
auto size = kernel.getCrossThreadDataSize() +
|
auto size = kernel.getCrossThreadDataSize() +
|
||||||
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize);
|
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, gfxCoreHelper);
|
||||||
|
|
||||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||||
if (pImplicitArgs) {
|
if (pImplicitArgs) {
|
||||||
size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor);
|
size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
|
||||||
}
|
}
|
||||||
return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
}
|
}
|
||||||
@@ -94,7 +107,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
|
|||||||
const MultiDispatchInfo &multiDispatchInfo) {
|
const MultiDispatchInfo &multiDispatchInfo) {
|
||||||
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(
|
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(
|
||||||
*dispatchInfo.getKernel(),
|
*dispatchInfo.getKernel(),
|
||||||
Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); });
|
dispatchInfo.getLocalWorkgroupSize().values); });
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
|
|||||||
@@ -87,11 +87,13 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
|||||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||||
if (pImplicitArgs) {
|
if (pImplicitArgs) {
|
||||||
const auto &kernelDescriptor = kernel.getDescriptor();
|
const auto &kernelDescriptor = kernel.getDescriptor();
|
||||||
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor);
|
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
||||||
|
auto isHwLocalIdGeneration = false;
|
||||||
|
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
|
||||||
|
|
||||||
auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed();
|
auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed();
|
||||||
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
||||||
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
|
||||||
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper);
|
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper);
|
||||||
|
|
||||||
auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast<uint64_t *>(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast<uint64_t *>(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||||
|
|||||||
@@ -76,12 +76,6 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
|||||||
pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData;
|
pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData;
|
||||||
|
|
||||||
const auto &kernelDescriptor = kernel.getDescriptor();
|
const auto &kernelDescriptor = kernel.getDescriptor();
|
||||||
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor);
|
|
||||||
|
|
||||||
auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - sizeof(ImplicitArgs);
|
|
||||||
offsetCrossThreadData += sizeForLocalIdsProgramming;
|
|
||||||
|
|
||||||
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
|
||||||
|
|
||||||
const auto &kernelAttributes = kernelDescriptor.kernelAttributes;
|
const auto &kernelAttributes = kernelDescriptor.kernelAttributes;
|
||||||
uint32_t requiredWalkOrder = 0u;
|
uint32_t requiredWalkOrder = 0u;
|
||||||
@@ -96,7 +90,15 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
|||||||
kernelAttributes.flags.requiresWorkgroupWalkOrder,
|
kernelAttributes.flags.requiresWorkgroupWalkOrder,
|
||||||
requiredWalkOrder,
|
requiredWalkOrder,
|
||||||
kernelDescriptor.kernelAttributes.simdSize);
|
kernelDescriptor.kernelAttributes.simdSize);
|
||||||
|
|
||||||
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
const auto &gfxCoreHelper = kernel.getGfxCoreHelper();
|
||||||
|
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !generationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||||
|
|
||||||
|
auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - sizeof(ImplicitArgs);
|
||||||
|
offsetCrossThreadData += sizeForLocalIdsProgramming;
|
||||||
|
|
||||||
|
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
|
||||||
|
|
||||||
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper);
|
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -741,10 +741,8 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
|
|||||||
CsrDependencies(),
|
CsrDependencies(),
|
||||||
walkerArgs);
|
walkerArgs);
|
||||||
|
|
||||||
Vec3<size_t> localWorkgroupSize(workGroupSize);
|
|
||||||
|
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernel, Math::computeTotalElementsCount(localWorkgroupSize));
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernel, workGroupSize);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
|
||||||
|
|
||||||
EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
|
EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
|
||||||
@@ -1433,7 +1431,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
|||||||
size_t workItems[3] = {1, 1, 1};
|
size_t workItems[3] = {1, 1, 1};
|
||||||
size_t workGroupSize[3] = {2, 5, 10};
|
size_t workGroupSize[3] = {2, 5, 10};
|
||||||
cl_uint dimensions = 1;
|
cl_uint dimensions = 1;
|
||||||
Vec3<size_t> localWorkgroupSize(workGroupSize);
|
|
||||||
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
|
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
|
||||||
|
|
||||||
kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u;
|
kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u;
|
||||||
@@ -1458,7 +1456,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
|||||||
CsrDependencies(),
|
CsrDependencies(),
|
||||||
walkerArgsWithoutImplicitArgs);
|
walkerArgsWithoutImplicitArgs);
|
||||||
|
|
||||||
auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithoutImplicitArgs, Math::computeTotalElementsCount(localWorkgroupSize));
|
auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithoutImplicitArgs, workGroupSize);
|
||||||
|
|
||||||
DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast<MockKernel *>(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets);
|
DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast<MockKernel *>(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets);
|
||||||
dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1});
|
dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1});
|
||||||
@@ -1473,7 +1471,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
|||||||
CsrDependencies(),
|
CsrDependencies(),
|
||||||
walkerArgsWithImplicitArgs);
|
walkerArgsWithImplicitArgs);
|
||||||
|
|
||||||
auto iohSizeWithImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, Math::computeTotalElementsCount(localWorkgroupSize));
|
auto iohSizeWithImplicitArgs = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize);
|
||||||
|
|
||||||
EXPECT_LE(iohSizeWithoutImplicitArgs, iohSizeWithImplicitArgs);
|
EXPECT_LE(iohSizeWithoutImplicitArgs, iohSizeWithImplicitArgs);
|
||||||
|
|
||||||
@@ -1481,9 +1479,10 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
|||||||
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||||
auto simdSize = kernelInfo.getMaxSimdSize();
|
auto simdSize = kernelInfo.getMaxSimdSize();
|
||||||
uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||||
|
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
||||||
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
|
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
|
||||||
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) +
|
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, gfxCoreHelper) +
|
||||||
ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor());
|
ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, gfxCoreHelper);
|
||||||
|
|
||||||
size = alignUp(size, MemoryConstants::cacheLineSize);
|
size = alignUp(size, MemoryConstants::cacheLineSize);
|
||||||
EXPECT_EQ(size, iohSizeWithImplicitArgs);
|
EXPECT_EQ(size, iohSizeWithImplicitArgs);
|
||||||
|
|||||||
@@ -499,7 +499,8 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenHelloWorldKernelWhenEnqueingKernelThenH
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]);
|
size_t localWorkSizes[] = {64, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
@@ -538,7 +539,8 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenKernelWithSimpleArgWhenEnqueingKernelTh
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]);
|
size_t localWorkSizes[] = {64, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
|
||||||
|
|
||||||
EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
|
|||||||
@@ -96,7 +96,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingImageThenHeapsAndCommandBufferCons
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel);
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
@@ -143,7 +144,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingReadWriteImageThenHeapsAndCommandB
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get(), {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get(), {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel.get());
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel.get());
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel.get());
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel.get(), localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get());
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get());
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
@@ -200,7 +202,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageNonBlockingThenHeapsAndComman
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel);
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
@@ -255,7 +258,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageBlockingThenHeapsAndCommandBu
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel);
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
@@ -310,7 +314,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageNonBlockingThenHeapsAndComman
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel);
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
@@ -365,7 +370,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageBlockingThenHeapsAndCommandBu
|
|||||||
|
|
||||||
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {});
|
||||||
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel);
|
size_t localWorkSizes[] = {256, 1, 1};
|
||||||
|
auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
||||||
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
// Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
|
||||||
|
|||||||
@@ -365,7 +365,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
|
|||||||
auto usedAfterIOH = ioh.getUsed();
|
auto usedAfterIOH = ioh.getUsed();
|
||||||
auto usedAfterSSH = ssh.getUsed();
|
auto usedAfterSSH = ssh.getUsed();
|
||||||
auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
|
||||||
auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSize);
|
auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSizes);
|
||||||
auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
|
||||||
|
|
||||||
EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
|
EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
|
||||||
@@ -548,11 +548,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
|||||||
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||||
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
||||||
auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||||
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize);
|
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
||||||
|
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, gfxCoreHelper);
|
||||||
ASSERT_LE(expectedIohSize, ioh.getUsed());
|
ASSERT_LE(expectedIohSize, ioh.getUsed());
|
||||||
|
|
||||||
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
|
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
|
||||||
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
|
||||||
generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
|
generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
|
||||||
std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
|
std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
|
||||||
std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
|
std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
|
||||||
@@ -1315,8 +1315,8 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
|||||||
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(expectedImplicitArgs.globalSizeX), static_cast<uint32_t>(expectedImplicitArgs.globalSizeY), static_cast<uint32_t>(expectedImplicitArgs.globalSizeZ));
|
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(expectedImplicitArgs.globalSizeX), static_cast<uint32_t>(expectedImplicitArgs.globalSizeY), static_cast<uint32_t>(expectedImplicitArgs.globalSizeZ));
|
||||||
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(expectedImplicitArgs.globalOffsetX), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetY), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetZ));
|
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(expectedImplicitArgs.globalOffsetX), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetY), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetZ));
|
||||||
kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ);
|
kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ);
|
||||||
|
const auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
||||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor());
|
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, gfxCoreHelper);
|
||||||
|
|
||||||
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||||
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||||
@@ -1382,7 +1382,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
|||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
||||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
|
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
@@ -1416,7 +1416,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
|||||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
||||||
|
|
||||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
|
||||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
|
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper);
|
||||||
|
|
||||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||||
alignedFree(expectedLocalIds);
|
alignedFree(expectedLocalIds);
|
||||||
|
|||||||
@@ -161,7 +161,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||||||
sizePerThreadData, hwInfo);
|
sizePerThreadData, hwInfo);
|
||||||
|
|
||||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||||
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor);
|
bool isHwLocalIdGeneration = false;
|
||||||
|
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
|
||||||
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
||||||
uint64_t offsetThreadData = 0u;
|
uint64_t offsetThreadData = 0u;
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -200,7 +200,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||||
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor);
|
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, gfxCoreHelper);
|
||||||
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
||||||
{
|
{
|
||||||
auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
|
auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include "shared/source/helpers/gfx_core_helper.h"
|
||||||
#include "shared/source/helpers/local_id_gen.h"
|
#include "shared/source/helpers/local_id_gen.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
@@ -19,8 +20,10 @@ struct PerThreadDataHelper {
|
|||||||
uint32_t simd,
|
uint32_t simd,
|
||||||
uint32_t grfSize,
|
uint32_t grfSize,
|
||||||
uint32_t numChannels,
|
uint32_t numChannels,
|
||||||
size_t localWorkSize) {
|
size_t localWorkSize,
|
||||||
return getThreadsPerWG(simd, static_cast<uint32_t>(localWorkSize)) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
|
bool isHwLocalIdGeneration,
|
||||||
|
const GfxCoreHelper &gfxCoreHelper) {
|
||||||
|
return gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
|
||||||
}
|
}
|
||||||
}; // namespace PerThreadDataHelper
|
}; // namespace PerThreadDataHelper
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ inline constexpr const char *implicitArgsRelocationSymbolName = "__INTEL_PATCH_C
|
|||||||
namespace ImplicitArgsHelper {
|
namespace ImplicitArgsHelper {
|
||||||
std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
|
std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
|
||||||
uint32_t getGrfSize(uint32_t simd);
|
uint32_t getGrfSize(uint32_t simd);
|
||||||
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor);
|
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool localIdsGeneratedByRuntime, const GfxCoreHelper &gfxCoreHelper);
|
||||||
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper);
|
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper);
|
||||||
} // namespace ImplicitArgsHelper
|
} // namespace ImplicitArgsHelper
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ uint32_t getGrfSize(uint32_t simd) {
|
|||||||
return 32u;
|
return 32u;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor) {
|
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const GfxCoreHelper &gfxCoreHelper) {
|
||||||
if (!pImplicitArgs) {
|
if (!pImplicitArgs) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -56,15 +56,15 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
|
|||||||
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
|
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
|
||||||
uint32_t localIdsSizeNeeded =
|
uint32_t localIdsSizeNeeded =
|
||||||
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||||
simdSize, grfSize, 3u, itemsInGroup)),
|
simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, gfxCoreHelper)),
|
||||||
MemoryConstants::cacheLineSize);
|
MemoryConstants::cacheLineSize);
|
||||||
return implicitArgsSize + localIdsSizeNeeded;
|
return implicitArgsSize + localIdsSizeNeeded;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) {
|
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) {
|
||||||
|
auto localIdsGeneratedByHw = hwGenerationOfLocalIdsParams.has_value() ? hwGenerationOfLocalIdsParams.value().first : false;
|
||||||
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor);
|
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, gfxCoreHelper);
|
||||||
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
|
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
|
||||||
|
|
||||||
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||||
|
|||||||
@@ -57,8 +57,8 @@ TEST(ImplicitArgsHelperTest, givenSimdGreaterThanOneWhenGettingGrfSizeThenGrfSiz
|
|||||||
TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) {
|
TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) {
|
||||||
|
|
||||||
KernelDescriptor kernelDescriptor{};
|
KernelDescriptor kernelDescriptor{};
|
||||||
|
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
||||||
EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor));
|
EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor, false, *gfxCoreHelper.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
||||||
@@ -75,8 +75,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
|||||||
|
|
||||||
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
||||||
|
|
||||||
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize), MemoryConstants::cacheLineSize);
|
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
||||||
EXPECT_EQ(localIdsSize + implicitArgs.structSize, ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor));
|
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, *gfxCoreHelper.get()), MemoryConstants::cacheLineSize);
|
||||||
|
EXPECT_EQ(localIdsSize + implicitArgs.structSize, ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
||||||
@@ -90,8 +91,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
|||||||
implicitArgs.localSizeX = 2;
|
implicitArgs.localSizeX = 2;
|
||||||
implicitArgs.localSizeY = 3;
|
implicitArgs.localSizeY = 3;
|
||||||
implicitArgs.localSizeZ = 4;
|
implicitArgs.localSizeZ = 4;
|
||||||
|
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
||||||
EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor));
|
EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
||||||
@@ -108,8 +109,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
|||||||
implicitArgs.localSizeX = 2;
|
implicitArgs.localSizeX = 2;
|
||||||
implicitArgs.localSizeY = 3;
|
implicitArgs.localSizeY = 3;
|
||||||
implicitArgs.localSizeZ = 4;
|
implicitArgs.localSizeZ = 4;
|
||||||
|
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
||||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor);
|
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get());
|
||||||
|
|
||||||
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
||||||
auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t);
|
auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t);
|
||||||
@@ -119,7 +120,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
|||||||
uint8_t pattern = 0xcd;
|
uint8_t pattern = 0xcd;
|
||||||
|
|
||||||
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
|
||||||
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
|
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
|
||||||
|
|
||||||
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
||||||
@@ -150,8 +151,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
|||||||
implicitArgs.localSizeX = 2;
|
implicitArgs.localSizeX = 2;
|
||||||
implicitArgs.localSizeY = 3;
|
implicitArgs.localSizeY = 3;
|
||||||
implicitArgs.localSizeZ = 4;
|
implicitArgs.localSizeZ = 4;
|
||||||
|
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
||||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor);
|
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get());
|
||||||
|
|
||||||
EXPECT_EQ(0x80u, totalSizeForPatching);
|
EXPECT_EQ(0x80u, totalSizeForPatching);
|
||||||
|
|
||||||
@@ -160,7 +161,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
|||||||
uint8_t pattern = 0xcd;
|
uint8_t pattern = 0xcd;
|
||||||
|
|
||||||
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
memset(memoryToPatch.get(), pattern, totalSizeForPatching);
|
||||||
auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily);
|
|
||||||
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
|
auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get());
|
||||||
|
|
||||||
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));
|
||||||
|
|||||||
Reference in New Issue
Block a user