refactor: move Kernel data members to KernelImp::sharedState

The class Kernel is an abstract interface and, as such, should not hold any data members.
Move its data members to KernelImp::sharedState, since they describe state that is shared with kernel clones.

Related-To: NEO-15374
Signed-off-by: Maciej Bielski <maciej.bielski@intel.com>
This commit is contained in:
Maciej Bielski
2025-08-29 12:17:48 +00:00
committed by Compute-Runtime-Automation
parent c1c1f1f0af
commit 91a4809a79
8 changed files with 89 additions and 102 deletions

View File

@@ -173,31 +173,13 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI, NEO::N
virtual ze_result_t setSchedulingHintExp(ze_scheduling_hint_exp_desc_t *pHint) = 0;
virtual uint32_t getMaxWgCountPerTile(NEO::EngineGroupType engineGroupType) const = 0;
static Kernel *fromHandle(ze_kernel_handle_t handle) { return static_cast<Kernel *>(handle); }
inline ze_kernel_handle_t toHandle() { return this; }
// Returns the stored per-tile maximum work-group count for the given engine group.
// renderCompute selects the RCS value, cooperativeCompute selects the cooperative
// value, and every other engine group falls back to the CCS value.
uint32_t getMaxWgCountPerTile(NEO::EngineGroupType engineGroupType) const {
// Start from the CCS value; it is the result for any engine group not matched below.
auto value = maxWgCountPerTileCcs;
if (engineGroupType == NEO::EngineGroupType::renderCompute) {
value = maxWgCountPerTileRcs;
} else if (engineGroupType == NEO::EngineGroupType::cooperativeCompute) {
value = maxWgCountPerTileCooperative;
}
return value;
}
virtual uint32_t getIndirectSize() const = 0;
protected:
uint32_t maxWgCountPerTileCcs = 0;
uint32_t maxWgCountPerTileRcs = 0;
uint32_t maxWgCountPerTileCooperative = 0;
bool heaplessEnabled = false;
bool implicitScalingEnabled = false;
bool localDispatchSupport = false;
bool rcsAvailable = false;
bool cooperativeSupport = false;
};
using KernelAllocatorFn = Kernel *(*)(Module *module);

View File

@@ -520,13 +520,13 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
this->privateState.perThreadDataSize = 0;
}
if (this->heaplessEnabled && this->localDispatchSupport) {
this->maxWgCountPerTileCcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::compute, true);
if (this->rcsAvailable) {
this->maxWgCountPerTileRcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::renderCompute, true);
if (this->sharedState->heaplessEnabled && this->sharedState->localDispatchSupport) {
this->sharedState->maxWgCountPerTileCcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::compute, true);
if (this->sharedState->rcsAvailable) {
this->sharedState->maxWgCountPerTileRcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::renderCompute, true);
}
if (this->cooperativeSupport) {
this->maxWgCountPerTileCooperative = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::cooperativeCompute, true);
if (this->sharedState->cooperativeSupport) {
this->sharedState->maxWgCountPerTileCooperative = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::cooperativeCompute, true);
}
}
return ZE_RESULT_SUCCESS;
@@ -611,7 +611,7 @@ uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineG
workDim,
localWorkSize,
engineGroupType,
this->implicitScalingEnabled,
this->sharedState->implicitScalingEnabled,
forceSingleTileQuery);
}
@@ -714,8 +714,8 @@ ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle
auto patchLocation = ptrOffset(getCrossThreadData(), arg.bindless);
// redescribed image's surface state is after image's implicit args and sampler
uint64_t bindlessSlotOffset = ssInHeap->surfaceStateOffset + surfaceStateSize * bindlessSlot;
uint32_t patchSize = this->heaplessEnabled ? 8u : 4u;
uint64_t patchValue = this->heaplessEnabled
uint32_t patchSize = this->sharedState->heaplessEnabled ? 8u : 4u;
uint64_t patchValue = this->sharedState->heaplessEnabled
? bindlessSlotOffset
: gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(bindlessSlotOffset));
@@ -917,7 +917,7 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void
auto patchLocation = ptrOffset(getCrossThreadData(), arg.bindless);
auto bindlessSlotOffset = ssInHeap->surfaceStateOffset;
uint32_t patchSize = NEO::isUndefined(arg.size) ? 0 : arg.size;
uint64_t patchValue = this->heaplessEnabled
uint64_t patchValue = this->sharedState->heaplessEnabled
? bindlessSlotOffset
: gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(bindlessSlotOffset));
@@ -1122,11 +1122,12 @@ void KernelImp::setInlineSamplers() {
}
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
this->sharedState->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
if (this->sharedState->kernelImmData == nullptr) {
auto &sharedState = *(this->sharedState);
sharedState.kernelImmData = module->getKernelImmutableData(desc->pKernelName);
if (sharedState.kernelImmData == nullptr) {
return ZE_RESULT_ERROR_INVALID_KERNEL_NAME;
}
auto &kernelImmData = *(this->sharedState->kernelImmData);
auto &kernelImmData = *(sharedState.kernelImmData);
auto neoDevice = module->getDevice()->getNEODevice();
auto &kernelDescriptor = kernelImmData.getDescriptor();
@@ -1156,16 +1157,16 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
auto deviceBitfield = neoDevice->getDeviceBitfield();
const auto &gfxHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->heaplessEnabled = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>().isHeaplessModeEnabled(hwInfo);
sharedState.heaplessEnabled = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>().isHeaplessModeEnabled(hwInfo);
bool platformImplicitScaling = gfxHelper.platformSupportsImplicitScaling(rootDeviceEnvironment);
this->implicitScalingEnabled = NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling);
sharedState.implicitScalingEnabled = NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling);
this->rcsAvailable = gfxHelper.isRcsAvailable(hwInfo);
this->cooperativeSupport = productHelper.isCooperativeEngineSupported(hwInfo);
this->sharedState->walkerInlineDataSize = gfxHelper.getDefaultWalkerInlineDataSize();
this->sharedState->surfaceStateAlignmentMask = gfxHelper.getSurfaceBaseAddressAlignmentMask();
this->sharedState->surfaceStateAlignment = gfxHelper.getSurfaceBaseAddressAlignment();
sharedState.rcsAvailable = gfxHelper.isRcsAvailable(hwInfo);
sharedState.cooperativeSupport = productHelper.isCooperativeEngineSupported(hwInfo);
sharedState.walkerInlineDataSize = gfxHelper.getDefaultWalkerInlineDataSize();
sharedState.surfaceStateAlignmentMask = gfxHelper.getSurfaceBaseAddressAlignmentMask();
sharedState.surfaceStateAlignment = gfxHelper.getSurfaceBaseAddressAlignment();
if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && kernelImmData.getIsaParentAllocation() == nullptr) {
isaAllocation->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
@@ -1238,7 +1239,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
privateState.pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
*privateState.pImplicitArgs = {};
privateState.pImplicitArgs->initializeHeader(this->sharedState->implicitArgsVersion);
privateState.pImplicitArgs->initializeHeader(sharedState.implicitArgsVersion);
privateState.pImplicitArgs->setSimdWidth(kernelDescriptor.kernelAttributes.simdSize);
}
@@ -1262,12 +1263,12 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
auto &kernelAttributes = kernelDescriptor.kernelAttributes;
if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
this->sharedState->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
if (this->sharedState->privateMemoryGraphicsAllocation == nullptr) {
sharedState.privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
if (sharedState.privateMemoryGraphicsAllocation == nullptr) {
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
}
this->patchCrossthreadDataWithPrivateAllocation(this->sharedState->privateMemoryGraphicsAllocation);
this->privateState.internalResidencyContainer.push_back(this->sharedState->privateMemoryGraphicsAllocation);
this->patchCrossthreadDataWithPrivateAllocation(sharedState.privateMemoryGraphicsAllocation);
this->privateState.internalResidencyContainer.push_back(sharedState.privateMemoryGraphicsAllocation);
}
this->createPrintfBuffer();
@@ -1342,15 +1343,6 @@ std::unique_ptr<KernelImp> KernelImp::cloneWithStateOverride(const KernelMutable
clone->cloneOrigin = this;
clone->sharedState = this->sharedState;
// Kernel-specific members dynamically set in `initialize()` but shareable with clones
clone->maxWgCountPerTileCcs = this->maxWgCountPerTileCcs;
clone->maxWgCountPerTileRcs = this->maxWgCountPerTileRcs;
clone->maxWgCountPerTileCooperative = this->maxWgCountPerTileCooperative;
clone->heaplessEnabled = this->heaplessEnabled;
clone->implicitScalingEnabled = this->implicitScalingEnabled;
clone->rcsAvailable = this->rcsAvailable;
clone->cooperativeSupport = this->cooperativeSupport;
if (stateOverride) {
clone->privateState = *stateOverride;
}
@@ -1359,18 +1351,19 @@ std::unique_ptr<KernelImp> KernelImp::cloneWithStateOverride(const KernelMutable
}
// Creates the printf buffer for this kernel when it is needed, i.e. when the kernel
// descriptor reports usesPrintf or when implicit args are present. The buffer is stored
// in the shared state, added to the internal residency container, and its GPU address
// is patched into cross-thread data and/or the implicit args. Does nothing otherwise.
// NOTE(review): this listing is a rendered diff, so each modified statement appears
// twice - first in its `this->sharedState->...` form, then in its `sharedState....` form.
void KernelImp::createPrintfBuffer() {
auto &sharedState = *(this->sharedState);
if (this->getImmutableData()->getDescriptor().kernelAttributes.flags.usesPrintf || privateState.pImplicitArgs) {
this->sharedState->printfBuffer = PrintfHandler::createPrintfBuffer(this->module->getDevice());
this->privateState.internalResidencyContainer.push_back(this->sharedState->printfBuffer);
sharedState.printfBuffer = PrintfHandler::createPrintfBuffer(this->module->getDevice());
this->privateState.internalResidencyContainer.push_back(sharedState.printfBuffer);
// Only kernels that actually use printf get the buffer address patched into cross-thread data.
if (this->getImmutableData()->getDescriptor().kernelAttributes.flags.usesPrintf) {
NEO::patchPointer(getCrossThreadDataSpan(),
this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.printfSurfaceAddress,
static_cast<uintptr_t>(this->sharedState->printfBuffer->getGpuAddressToPatch()));
static_cast<uintptr_t>(sharedState.printfBuffer->getGpuAddressToPatch()));
}
// When implicit args exist, they also receive the printf buffer GPU address.
if (privateState.pImplicitArgs) {
privateState.pImplicitArgs->setPrintfBuffer(this->sharedState->printfBuffer->getGpuAddress());
privateState.pImplicitArgs->setPrintfBuffer(sharedState.printfBuffer->getGpuAddress());
}
// Keep a pointer to the owning device's printf mutex in the shared state.
this->sharedState->devicePrintfKernelMutex = &(static_cast<DeviceImp *>(this->module->getDevice())->printfKernelMutex);
sharedState.devicePrintfKernelMutex = &(static_cast<DeviceImp *>(this->module->getDevice())->printfKernelMutex);
}
}

View File

@@ -205,6 +205,16 @@ struct KernelImp : Kernel {
NEO::ImplicitArgs *getImplicitArgs() const override { return privateState.pImplicitArgs.get(); }
// Returns the per-tile maximum work-group count held in sharedState for the given
// engine group. renderCompute selects the RCS value, cooperativeCompute selects the
// cooperative value, and every other engine group falls back to the CCS value.
uint32_t getMaxWgCountPerTile(NEO::EngineGroupType engineGroupType) const override {
// Start from the CCS value; it is the result for any engine group not matched below.
auto value = this->sharedState->maxWgCountPerTileCcs;
if (engineGroupType == NEO::EngineGroupType::renderCompute) {
value = this->sharedState->maxWgCountPerTileRcs;
} else if (engineGroupType == NEO::EngineGroupType::cooperativeCompute) {
value = this->sharedState->maxWgCountPerTileCooperative;
}
return value;
}
KernelExt *getExtension(uint32_t extensionType);
bool checkKernelContainsStatefulAccess();

View File

@@ -31,6 +31,16 @@ struct KernelSharedState {
uint32_t implicitArgsVersion = 0;
uint32_t walkerInlineDataSize = 0;
uint32_t maxWgCountPerTileCcs = 0;
uint32_t maxWgCountPerTileRcs = 0;
uint32_t maxWgCountPerTileCooperative = 0;
bool heaplessEnabled = false;
bool implicitScalingEnabled = false;
bool localDispatchSupport = false;
bool rcsAvailable = false;
bool cooperativeSupport = false;
};
} // namespace L0

View File

@@ -40,7 +40,7 @@ uint32_t KernelImpSuggestMaxCooperativeGroupCountFixture::getMaxWorkGroupCount()
kernel.sharedState->kernelImmData = &kernelInfo;
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::user);
kernel.module = module.get();
kernel.implicitScalingEnabled = device->getNEODevice()->getDeviceBitfield().count() > 1;
kernel.sharedState->implicitScalingEnabled = device->getNEODevice()->getDeviceBitfield().count() > 1;
kernel.privateState.groupSize[0] = lws[0];
kernel.privateState.groupSize[1] = lws[1];
kernel.privateState.groupSize[2] = lws[2];

View File

@@ -41,23 +41,15 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
using BaseClass = ::L0::KernelImp;
using BaseClass::BaseClass;
using ::L0::KernelImp::cloneOrigin;
using ::L0::KernelImp::cooperativeSupport;
using ::L0::KernelImp::createPrintfBuffer;
using ::L0::KernelImp::getCrossThreadDataSpan;
using ::L0::KernelImp::getDynamicStateHeapDataSpan;
using ::L0::KernelImp::getSurfaceStateHeapDataSpan;
using ::L0::KernelImp::heaplessEnabled;
using ::L0::KernelImp::implicitScalingEnabled;
using ::L0::KernelImp::localDispatchSupport;
using ::L0::KernelImp::maxWgCountPerTileCcs;
using ::L0::KernelImp::maxWgCountPerTileCooperative;
using ::L0::KernelImp::maxWgCountPerTileRcs;
using ::L0::KernelImp::module;
using ::L0::KernelImp::patchBindlessOffsetsInCrossThreadData;
using ::L0::KernelImp::patchBindlessSurfaceState;
using ::L0::KernelImp::patchSamplerBindlessOffsetsInCrossThreadData;
using ::L0::KernelImp::privateState;
using ::L0::KernelImp::rcsAvailable;
using ::L0::KernelImp::setAssertBuffer;
using ::L0::KernelImp::sharedState;

View File

@@ -3168,7 +3168,7 @@ HWTEST2_F(SetKernelArg, givenHeaplessWhenPatchingImageWithBindlessEnabledCorrect
for (auto heaplessEnabled : {false, true}) {
createKernel();
kernel->heaplessEnabled = heaplessEnabled;
kernel->sharedState->heaplessEnabled = heaplessEnabled;
neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(neoDevice,
neoDevice->getNumGenericSubDevices() > 1);
@@ -3199,11 +3199,11 @@ HWTEST2_F(SetKernelArg, givenHeaplessWhenPatchingImageWithBindlessEnabledCorrect
auto ssInHeap = imageHW->getBindlessSlot();
auto patchLocation = ptrOffset(ctd, imageArg.bindless);
uint64_t bindlessSlotOffset = ssInHeap->surfaceStateOffset + surfaceStateSize * NEO::BindlessImageSlot::redescribedImage;
uint64_t expectedPatchValue = kernel->heaplessEnabled
uint64_t expectedPatchValue = kernel->sharedState->heaplessEnabled
? bindlessSlotOffset
: gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(bindlessSlotOffset));
if (kernel->heaplessEnabled) {
if (kernel->sharedState->heaplessEnabled) {
uint64_t patchedValued = *(reinterpret_cast<uint64_t *>(patchLocation));
EXPECT_EQ(expectedPatchValue, patchedValued);
} else {

View File

@@ -272,13 +272,13 @@ TEST_F(KernelImpTest, GivenKernelMutableStateWhenKernelImpClonedThenStateAssigne
EXPECT_EQ(kernel2->sharedState->surfaceStateAlignment, kernel1.sharedState->surfaceStateAlignment);
EXPECT_EQ(kernel2->sharedState->implicitArgsVersion, kernel1.sharedState->implicitArgsVersion);
EXPECT_EQ(kernel2->sharedState->walkerInlineDataSize, kernel1.sharedState->walkerInlineDataSize);
EXPECT_EQ(kernel2->maxWgCountPerTileCcs, kernel1.maxWgCountPerTileCcs);
EXPECT_EQ(kernel2->maxWgCountPerTileRcs, kernel1.maxWgCountPerTileRcs);
EXPECT_EQ(kernel2->maxWgCountPerTileCooperative, kernel1.maxWgCountPerTileCooperative);
EXPECT_EQ(kernel2->heaplessEnabled, kernel1.heaplessEnabled);
EXPECT_EQ(kernel2->implicitScalingEnabled, kernel1.implicitScalingEnabled);
EXPECT_EQ(kernel2->rcsAvailable, kernel1.rcsAvailable);
EXPECT_EQ(kernel2->cooperativeSupport, kernel1.cooperativeSupport);
EXPECT_EQ(kernel2->sharedState->maxWgCountPerTileCcs, kernel1.sharedState->maxWgCountPerTileCcs);
EXPECT_EQ(kernel2->sharedState->maxWgCountPerTileRcs, kernel1.sharedState->maxWgCountPerTileRcs);
EXPECT_EQ(kernel2->sharedState->maxWgCountPerTileCooperative, kernel1.sharedState->maxWgCountPerTileCooperative);
EXPECT_EQ(kernel2->sharedState->heaplessEnabled, kernel1.sharedState->heaplessEnabled);
EXPECT_EQ(kernel2->sharedState->implicitScalingEnabled, kernel1.sharedState->implicitScalingEnabled);
EXPECT_EQ(kernel2->sharedState->rcsAvailable, kernel1.sharedState->rcsAvailable);
EXPECT_EQ(kernel2->sharedState->cooperativeSupport, kernel1.sharedState->cooperativeSupport);
}
TEST_F(KernelImpTest, GivenCrossThreadDataThenIsCorrectlyPatchedWithGlobalWorkSizeAndGroupCount) {
@@ -1201,41 +1201,41 @@ TEST_F(KernelImpTest, givenHeaplessAndLocalDispatchEnabledWheSettingGroupSizeThe
Mock<::L0::KernelImp> kernel;
kernel.module = &module;
kernel.heaplessEnabled = false;
kernel.localDispatchSupport = false;
kernel.sharedState->heaplessEnabled = false;
kernel.sharedState->localDispatchSupport = false;
kernel.setGroupSize(128, 1, 1);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileCooperative);
kernel.heaplessEnabled = true;
kernel.sharedState->heaplessEnabled = true;
kernel.setGroupSize(64, 2, 1);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileCooperative);
kernel.localDispatchSupport = true;
kernel.sharedState->localDispatchSupport = true;
kernel.setGroupSize(32, 4, 1);
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
EXPECT_NE(0u, kernel.sharedState->maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileCooperative);
kernel.rcsAvailable = true;
kernel.sharedState->rcsAvailable = true;
kernel.setGroupSize(16, 8, 1);
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
EXPECT_NE(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
EXPECT_NE(0u, kernel.sharedState->maxWgCountPerTileCcs);
EXPECT_NE(0u, kernel.sharedState->maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.sharedState->maxWgCountPerTileCooperative);
kernel.cooperativeSupport = true;
kernel.sharedState->cooperativeSupport = true;
kernel.setGroupSize(8, 8, 2);
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
EXPECT_NE(0u, kernel.maxWgCountPerTileRcs);
EXPECT_NE(0u, kernel.maxWgCountPerTileCooperative);
EXPECT_NE(0u, kernel.sharedState->maxWgCountPerTileCcs);
EXPECT_NE(0u, kernel.sharedState->maxWgCountPerTileRcs);
EXPECT_NE(0u, kernel.sharedState->maxWgCountPerTileCooperative);
}
TEST_F(KernelImpTest, givenCorrectEngineTypeWhenGettingMaxWgCountPerTileThenReturnActualValue) {
@@ -1243,9 +1243,9 @@ TEST_F(KernelImpTest, givenCorrectEngineTypeWhenGettingMaxWgCountPerTileThenRetu
Mock<::L0::KernelImp> kernel;
kernel.module = &module;
kernel.maxWgCountPerTileCcs = 4;
kernel.maxWgCountPerTileRcs = 2;
kernel.maxWgCountPerTileCooperative = 100;
kernel.sharedState->maxWgCountPerTileCcs = 4;
kernel.sharedState->maxWgCountPerTileRcs = 2;
kernel.sharedState->maxWgCountPerTileCooperative = 100;
EXPECT_EQ(4u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::compute));
EXPECT_EQ(2u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::renderCompute));