Modify function dispatching cross and per-thread data

Related-To: NEO-4585

Change-Id: Ia6b54b8d0c868cab5403332411655dc8c9ef4c8d
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2020-07-06 22:55:37 +02:00
committed by sys_ocldev
parent 97aa485048
commit bac5506b62
13 changed files with 283 additions and 82 deletions

View File

@@ -119,6 +119,24 @@ struct KernelHw : public KernelImp {
return ret;
}
// Determines whether local IDs for this kernel have to be generated by the runtime
// and stores the answer in kernelRequiresGenerationOfLocalIdsByRuntime.
//
// The decision itself is delegated to
// NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired, which is
// given the currently stored group size plus the local-ID channel count, work-group
// walk order, walk-order requirement flag and SIMD size read from the descriptor.
// requiredWorkgroupOrder is passed to the helper as well.
//
// kernelDescriptor: descriptor whose kernelAttributes are queried.
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
    const auto &attributes = kernelDescriptor.kernelAttributes;

    // The group size is handed over as an array of size_t values.
    size_t localWorkSizes[3] = {static_cast<size_t>(this->groupSize[0]),
                                static_cast<size_t>(this->groupSize[1]),
                                static_cast<size_t>(this->groupSize[2])};

    // Walk order is copied element by element into the std::array the helper accepts.
    std::array<uint8_t, 3> walkOrder{
        {attributes.workgroupWalkOrder[0],
         attributes.workgroupWalkOrder[1],
         attributes.workgroupWalkOrder[2]}};

    kernelRequiresGenerationOfLocalIdsByRuntime =
        NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
            attributes.numLocalIdChannels,
            localWorkSizes,
            walkOrder,
            attributes.flags.requiresWorkgroupWalkOrder,
            requiredWorkgroupOrder,
            attributes.simdSize);
}
};
} // namespace L0

View File

@@ -237,37 +237,14 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
}
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
kernelImmData->getDescriptor().kernelAttributes.simdSize, grfSize, numChannels, itemsInGroup));
if (perThreadDataSizeForWholeThreadGroupNeeded >
perThreadDataSizeForWholeThreadGroupAllocated) {
alignedFree(perThreadDataForWholeThreadGroup);
perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
}
perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;
if (numChannels > 0) {
UNRECOVERABLE_IF(3 != numChannels);
NEO::generateLocalIDs(
perThreadDataForWholeThreadGroup,
static_cast<uint16_t>(kernelImmData->getDescriptor().kernelAttributes.simdSize),
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
static_cast<uint16_t>(groupSizeY),
static_cast<uint16_t>(groupSizeZ)}},
std::array<uint8_t, 3>{{0, 1, 2}},
false, grfSize);
}
this->groupSize[0] = groupSizeX;
this->groupSize[1] = groupSizeY;
this->groupSize[2] = groupSizeZ;
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
auto simdSize = kernelImmData->getDescriptor().kernelAttributes.simdSize;
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
@@ -275,7 +252,35 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
if (!threadExecutionMask) {
threadExecutionMask = static_cast<uint32_t>(maxNBitValue((simdSize == 1) ? 32 : simdSize));
}
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
simdSize, grfSize, numChannels, itemsInGroup));
if (perThreadDataSizeForWholeThreadGroupNeeded >
perThreadDataSizeForWholeThreadGroupAllocated) {
alignedFree(perThreadDataForWholeThreadGroup);
perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
}
perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;
if (numChannels > 0) {
UNRECOVERABLE_IF(3 != numChannels);
NEO::generateLocalIDs(
perThreadDataForWholeThreadGroup,
static_cast<uint16_t>(simdSize),
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
static_cast<uint16_t>(groupSizeY),
static_cast<uint16_t>(groupSizeZ)}},
std::array<uint8_t, 3>{{0, 1, 2}},
false, grfSize);
}
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
}
return ZE_RESULT_SUCCESS;
}

View File

@@ -105,6 +105,9 @@ struct KernelImp : Kernel {
uint32_t getSlmTotalSize() const override;
NEO::GraphicsAllocation *getIsaAllocation() const override;
// Returns the value currently held in the requiredWorkgroupOrder member.
uint32_t getRequiredWorkgroupOrder() const override { return requiredWorkgroupOrder; }
// Returns the kernelRequiresGenerationOfLocalIdsByRuntime flag, i.e. whether local IDs
// for this kernel are to be generated by the runtime.
bool requiresGenerationOfLocalIdsByRuntime() const override { return kernelRequiresGenerationOfLocalIdsByRuntime; }
protected:
KernelImp() = default;
@@ -112,6 +115,7 @@ struct KernelImp : Kernel {
void createPrintfBuffer();
void setDebugSurface();
// Hook that derived kernels must implement; setGroupSize calls it with the kernel's
// descriptor and afterwards reads kernelRequiresGenerationOfLocalIdsByRuntime to decide
// whether per-thread local-ID data has to be produced.
virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0;
const KernelImmutableData *kernelImmData = nullptr;
Module *module = nullptr;
@@ -143,6 +147,9 @@ struct KernelImp : Kernel {
UnifiedMemoryControls unifiedMemoryControls;
std::vector<uint32_t> slmArgSizes;
uint32_t slmArgsTotalSize = 0U;
uint32_t requiredWorkgroupOrder = 0u;
bool kernelRequiresGenerationOfLocalIdsByRuntime = true;
};
} // namespace L0

View File

@@ -40,18 +40,22 @@ struct WhiteBox<::L0::Kernel> : public ::L0::KernelImp {
using ::L0::KernelImp::crossThreadDataSize;
using ::L0::KernelImp::groupSize;
using ::L0::KernelImp::kernelImmData;
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
using ::L0::KernelImp::module;
using ::L0::KernelImp::numThreadsPerThreadGroup;
using ::L0::KernelImp::perThreadDataForWholeThreadGroup;
using ::L0::KernelImp::perThreadDataSize;
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;
using ::L0::KernelImp::printfBuffer;
using ::L0::KernelImp::requiredWorkgroupOrder;
using ::L0::KernelImp::residencyContainer;
using ::L0::KernelImp::unifiedMemoryControls;
// Deliberately empty override: this white-box type does nothing when asked to set a
// buffer surface state.
void setBufferSurfaceState(uint32_t argIndex, void *address,
NEO::GraphicsAllocation *alloc) override {}
// Empty override: kernelRequiresGenerationOfLocalIdsByRuntime is never recomputed here,
// so whatever value a test assigns to the flag stays in effect.
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
// Cloning is not provided by this white-box type; always returns nullptr.
std::unique_ptr<Kernel> clone() const override { return nullptr; }
// Default constructor: forwards nullptr to the KernelImp base constructor.
WhiteBox() : ::L0::KernelImp(nullptr) {}
@@ -85,6 +89,7 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
}
// Deliberately empty override: the mock does nothing when asked to set a buffer
// surface state.
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
// Empty override: the mock never recomputes kernelRequiresGenerationOfLocalIdsByRuntime,
// so a value assigned by a test is what setGroupSize subsequently observes.
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
// Cloning is not provided by the mock; always returns nullptr.
std::unique_ptr<Kernel> clone() const override {
return nullptr;
}

View File

@@ -51,6 +51,22 @@ HWTEST_F(KernelImpSetGroupSizeTest, WhenCalculatingLocalIdsThenGrfSizeIsTakenFro
}
}
// With runtime local-ID generation switched off, setGroupSize is expected to succeed and
// compute the thread count, while the per-thread data sizes stay zero and the
// per-thread data buffer stays null.
HWTEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSettingGroupSizeThenLocalIdsAreNotGenerated) {
    Mock<Kernel> mockKernel;
    Mock<Module> mockModule(this->device, nullptr);
    mockKernel.module = &mockModule;

    // SIMD size 1: the thread count equals the number of work items in the group.
    mockKernel.descriptor.kernelAttributes.simdSize = 1;

    // The mock's evaluateIfRequiresGenerationOfLocalIdsByRuntime is a no-op, so this
    // value is the one setGroupSize acts upon.
    mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = false;

    constexpr uint32_t groupSizeX = 2;
    constexpr uint32_t groupSizeY = 3;
    constexpr uint32_t groupSizeZ = 5;
    constexpr uint32_t expectedThreadCount = groupSizeX * groupSizeY * groupSizeZ;

    const auto result = mockKernel.setGroupSize(groupSizeX, groupSizeY, groupSizeZ);

    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(expectedThreadCount, mockKernel.numThreadsPerThreadGroup);
    EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup);
    EXPECT_EQ(0u, mockKernel.perThreadDataSize);
    EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup);
}
using SetKernelArg = Test<ModuleFixture>;
using ImageSupport = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>;
@@ -234,5 +250,11 @@ HWTEST_F(KernelPropertiesTest, givenKernelThenPropertiesAreRetrieved) {
Kernel::fromHandle(kernelHandle)->destroy();
}
// A kernel produced by the fixture's createKernel() must report that local IDs are
// generated by the runtime.
HWTEST_F(KernelPropertiesTest, WhenKernelIsCreatedThenDefaultLocalIdGenerationbyRuntimeIsTrue) {
    createKernel();

    const bool generatedByRuntime = kernel->requiresGenerationOfLocalIdsByRuntime();
    EXPECT_TRUE(generatedByRuntime);
}
} // namespace ult
} // namespace L0