mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
Modify function dispatching cross-thread and per-thread data

Related-To: NEO-4585
Change-Id: Ia6b54b8d0c868cab5403332411655dc8c9ef4c8d
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Committed by: sys_ocldev
Parent commit: 97aa485048
This commit: bac5506b62
@@ -119,6 +119,24 @@ struct KernelHw : public KernelImp {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/// Decides whether local work-item IDs must be generated by the runtime
/// (software path) or can be produced by hardware, based on the kernel's
/// descriptor and the group size currently set on this kernel.
/// Stores the verdict in kernelRequiresGenerationOfLocalIdsByRuntime.
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
    const auto &attributes = kernelDescriptor.kernelAttributes;

    // Mirror the currently configured group size into the shape the
    // encode helper expects (size_t[3]).
    size_t localWorkSizes[3] = {this->groupSize[0],
                                this->groupSize[1],
                                this->groupSize[2]};

    std::array<uint8_t, 3> walkOrder{{attributes.workgroupWalkOrder[0],
                                      attributes.workgroupWalkOrder[1],
                                      attributes.workgroupWalkOrder[2]}};

    kernelRequiresGenerationOfLocalIdsByRuntime =
        NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
            attributes.numLocalIdChannels,
            localWorkSizes,
            walkOrder,
            attributes.flags.requiresWorkgroupWalkOrder,
            requiredWorkgroupOrder,
            attributes.simdSize);
}
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -237,37 +237,14 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
DEBUG_BREAK_IF(true);
|
||||
return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
|
||||
}
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
kernelImmData->getDescriptor().kernelAttributes.simdSize, grfSize, numChannels, itemsInGroup));
|
||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||
alignedFree(perThreadDataForWholeThreadGroup);
|
||||
perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
|
||||
perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
}
|
||||
perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
|
||||
if (numChannels > 0) {
|
||||
UNRECOVERABLE_IF(3 != numChannels);
|
||||
NEO::generateLocalIDs(
|
||||
perThreadDataForWholeThreadGroup,
|
||||
static_cast<uint16_t>(kernelImmData->getDescriptor().kernelAttributes.simdSize),
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
|
||||
static_cast<uint16_t>(groupSizeY),
|
||||
static_cast<uint16_t>(groupSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||
false, grfSize);
|
||||
}
|
||||
|
||||
this->groupSize[0] = groupSizeX;
|
||||
this->groupSize[1] = groupSizeY;
|
||||
this->groupSize[2] = groupSizeZ;
|
||||
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
|
||||
auto simdSize = kernelImmData->getDescriptor().kernelAttributes.simdSize;
|
||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
|
||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
|
||||
|
||||
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
|
||||
@@ -275,7 +252,35 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
if (!threadExecutionMask) {
|
||||
threadExecutionMask = static_cast<uint32_t>(maxNBitValue((simdSize == 1) ? 32 : simdSize));
|
||||
}
|
||||
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
|
||||
|
||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
simdSize, grfSize, numChannels, itemsInGroup));
|
||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||
alignedFree(perThreadDataForWholeThreadGroup);
|
||||
perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
|
||||
perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
}
|
||||
perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
|
||||
if (numChannels > 0) {
|
||||
UNRECOVERABLE_IF(3 != numChannels);
|
||||
NEO::generateLocalIDs(
|
||||
perThreadDataForWholeThreadGroup,
|
||||
static_cast<uint16_t>(simdSize),
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
|
||||
static_cast<uint16_t>(groupSizeY),
|
||||
static_cast<uint16_t>(groupSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||
false, grfSize);
|
||||
}
|
||||
|
||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||
}
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -105,6 +105,9 @@ struct KernelImp : Kernel {
|
||||
uint32_t getSlmTotalSize() const override;
|
||||
NEO::GraphicsAllocation *getIsaAllocation() const override;
|
||||
|
||||
uint32_t getRequiredWorkgroupOrder() const override { return requiredWorkgroupOrder; }
|
||||
bool requiresGenerationOfLocalIdsByRuntime() const override { return kernelRequiresGenerationOfLocalIdsByRuntime; }
|
||||
|
||||
protected:
|
||||
KernelImp() = default;
|
||||
|
||||
@@ -112,6 +115,7 @@ struct KernelImp : Kernel {
|
||||
|
||||
void createPrintfBuffer();
|
||||
void setDebugSurface();
|
||||
virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0;
|
||||
|
||||
const KernelImmutableData *kernelImmData = nullptr;
|
||||
Module *module = nullptr;
|
||||
@@ -143,6 +147,9 @@ struct KernelImp : Kernel {
|
||||
UnifiedMemoryControls unifiedMemoryControls;
|
||||
std::vector<uint32_t> slmArgSizes;
|
||||
uint32_t slmArgsTotalSize = 0U;
|
||||
uint32_t requiredWorkgroupOrder = 0u;
|
||||
|
||||
bool kernelRequiresGenerationOfLocalIdsByRuntime = true;
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -40,18 +40,22 @@ struct WhiteBox<::L0::Kernel> : public ::L0::KernelImp {
|
||||
using ::L0::KernelImp::crossThreadDataSize;
|
||||
using ::L0::KernelImp::groupSize;
|
||||
using ::L0::KernelImp::kernelImmData;
|
||||
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
|
||||
using ::L0::KernelImp::module;
|
||||
using ::L0::KernelImp::numThreadsPerThreadGroup;
|
||||
using ::L0::KernelImp::perThreadDataForWholeThreadGroup;
|
||||
using ::L0::KernelImp::perThreadDataSize;
|
||||
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;
|
||||
using ::L0::KernelImp::printfBuffer;
|
||||
using ::L0::KernelImp::requiredWorkgroupOrder;
|
||||
using ::L0::KernelImp::residencyContainer;
|
||||
using ::L0::KernelImp::unifiedMemoryControls;
|
||||
|
||||
void setBufferSurfaceState(uint32_t argIndex, void *address,
|
||||
NEO::GraphicsAllocation *alloc) override {}
|
||||
|
||||
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
|
||||
|
||||
std::unique_ptr<Kernel> clone() const override { return nullptr; }
|
||||
|
||||
WhiteBox() : ::L0::KernelImp(nullptr) {}
|
||||
@@ -85,6 +89,7 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
|
||||
}
|
||||
|
||||
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
|
||||
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
|
||||
std::unique_ptr<Kernel> clone() const override {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -51,6 +51,22 @@ HWTEST_F(KernelImpSetGroupSizeTest, WhenCalculatingLocalIdsThenGrfSizeIsTakenFro
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSettingGroupSizeThenLocalIdsAreNotGenerated) {
    // Arrange a kernel whose local IDs are produced by hardware: the
    // runtime path that sizes and allocates per-thread data must stay idle.
    Mock<Kernel> mockKernel;
    Mock<Module> mockModule(this->device, nullptr);
    mockKernel.descriptor.kernelAttributes.simdSize = 1;
    mockKernel.module = &mockModule;
    mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = false;

    const uint32_t sizeX = 2u;
    const uint32_t sizeY = 3u;
    const uint32_t sizeZ = 5u;
    const auto result = mockKernel.setGroupSize(sizeX, sizeY, sizeZ);

    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    // With simd == 1, every work item maps onto its own hardware thread.
    EXPECT_EQ(sizeX * sizeY * sizeZ, mockKernel.numThreadsPerThreadGroup);
    // No per-thread payload may have been sized or allocated.
    EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup);
    EXPECT_EQ(0u, mockKernel.perThreadDataSize);
    EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup);
}
|
||||
|
||||
using SetKernelArg = Test<ModuleFixture>;
|
||||
using ImageSupport = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>;
|
||||
|
||||
@@ -234,5 +250,11 @@ HWTEST_F(KernelPropertiesTest, givenKernelThenPropertiesAreRetrieved) {
|
||||
Kernel::fromHandle(kernelHandle)->destroy();
|
||||
}
|
||||
|
||||
HWTEST_F(KernelPropertiesTest, WhenKernelIsCreatedThenDefaultLocalIdGenerationbyRuntimeIsTrue) {
|
||||
// A freshly created kernel must default to runtime-generated local IDs
// (kernelRequiresGenerationOfLocalIdsByRuntime is initialized to true)
// until evaluateIfRequiresGenerationOfLocalIdsByRuntime decides otherwise.
createKernel();
|
||||
|
||||
EXPECT_TRUE(kernel->requiresGenerationOfLocalIdsByRuntime());
|
||||
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
||||
Reference in New Issue
Block a user