Refactor implicit scaling parameters for surface state

Related-To: NEO-6589

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
commit c36c083812 (parent 79c8605ed2)
Author: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Committed: Compute-Runtime-Automation, 2022-01-14 17:50:42 +00:00

9 changed files with 105 additions and 18 deletions


@@ -266,6 +266,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
     args.gmmHelper = neoDevice->getGmmHelper();
     args.useGlobalAtomics = kernelImp->getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics;
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+    args.implicitScaling = this->partitionCount > 1;
     NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
     *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateSpace) = surfaceState;
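For orientation, here is a minimal sketch of what the new flag is for. This is an illustration only, not the actual NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer implementation; the struct and function names are hypothetical, and the setter names mirror the getters exercised by the test at the end of this diff. The intent: when a dispatch is implicitly scaled across tiles, the buffer's RENDER_SURFACE_STATE must keep multi-GPU atomics and partial writes enabled, i.e. leave the inverted "disable" bits cleared.

    // Hypothetical sketch only -- not the real NEO encoder.
    struct SurfaceStateArgsSketch {
        bool useGlobalAtomics = false;
        bool areMultipleSubDevicesInContext = false;
        bool implicitScaling = false; // the field this commit introduces
    };

    template <typename SurfaceStateT>
    void programMultiGpuBitsSketch(SurfaceStateT &state, const SurfaceStateArgsSketch &args) {
        // Multi-GPU support must stay on when the dispatch spans tiles, or when
        // global atomics are used with multiple sub-devices in the context.
        const bool multiGpuNeeded = args.implicitScaling ||
                                    (args.areMultipleSubDevicesInContext && args.useGlobalAtomics);
        // The hardware bits are inverted: writing false keeps the support enabled.
        state.setDisableSupportForMultiGpuAtomics(!multiGpuNeeded);
        state.setDisableSupportForMultiGpuPartialWrites(!multiGpuNeeded);
    }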


@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,12 +63,14 @@ struct KernelHw : public KernelImp {
         bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
         bool l3Enabled = true;
         // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
         // Most commonly this issue will occur with Host Point Allocations from customers.
         l3Enabled = isL3Capable(*alloc);
-        auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
+        Device *device = module->getDevice();
+        NEO::Device *neoDevice = device->getNEODevice();
+        auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
         if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
             l3Enabled = false;
         }
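The two hoisted locals, device and neoDevice, exist for the next hunk: the MOCS, sub-device, GMM-helper, and new implicit-scaling lookups below all reuse them instead of re-deriving the device through this->module->getDevice() on every line.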
@@ -77,18 +79,17 @@ struct KernelHw : public KernelImp {
             this->kernelRequiresQueueUncachedMocsCount++;
         }
-        NEO::Device *neoDevice = module->getDevice()->getNEODevice();
         NEO::EncodeSurfaceStateArgs args;
         args.outMemory = &surfaceState;
         args.graphicsAddress = bufferAddressForSsh;
         args.size = bufferSizeForSsh;
-        args.mocs = this->module->getDevice()->getMOCS(l3Enabled, false);
+        args.mocs = device->getMOCS(l3Enabled, false);
         args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
         args.allocation = alloc;
         args.gmmHelper = neoDevice->getGmmHelper();
         args.useGlobalAtomics = kernelImmData->getDescriptor().kernelAttributes.flags.useGlobalAtomics;
         args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+        args.implicitScaling = device->isImplicitScalingCapable();
         NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
         *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress) = surfaceState;
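Note that the two producers of args.implicitScaling seen so far use different predicates: the command-list path in the first hunk keys off this->partitionCount > 1 (whether this particular dispatch is actually partitioned across tiles), while the kernel surface-state path here asks device->isImplicitScalingCapable() (whether the device can be partitioned at all). Both feed the same EncodeSurfaceStateArgs field.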


@@ -84,7 +84,8 @@ KernelImmutableData::~KernelImmutableData() {
 inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                      uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
-                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
+                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics,
+                                     bool implicitScaling) {
     if (false == crossThreadData.empty()) {
         NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
     }
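Pieced together from the split -/+ lines above, the helper's signature after this change reads:

    inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                         uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
                                         const NEO::ArgDescPointer &ptr, const NEO::Device &device,
                                         bool useGlobalAtomics, bool implicitScaling);

Every caller below is updated to supply the extra argument.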
@@ -107,6 +108,7 @@ inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef
     args.numAvailableDevices = device.getNumGenericSubDevices();
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
     args.mocs = hwHelper.getMocsIndex(*args.gmmHelper, true, false) << 1;
+    args.implicitScaling = implicitScaling;
     hwHelper.encodeBufferSurfaceState(args);
 }
@@ -179,7 +181,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                  *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalConstBuffer);
     } else if (nullptr != globalConstBuffer) {
         this->residencyContainer.push_back(globalConstBuffer);
@@ -191,7 +193,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                  *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalVarBuffer);
     } else if (nullptr != globalVarBuffer) {
         this->residencyContainer.push_back(globalVarBuffer);
@@ -758,7 +760,7 @@ void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocatio
     patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                              static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                              *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
-                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics);
+                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
 }
 
 ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
@@ -941,7 +943,7 @@ void KernelImp::setDebugSurface() {
         patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                  0,
                                  *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
-                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
+                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
     }
 }
 void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {


@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -7,6 +7,7 @@
 #pragma once
 
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/helpers/file_io.h"
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/program/kernel_info.h"
@@ -392,5 +393,26 @@ struct ImportHostPointerModuleFixture : public ModuleFixture {
     void *hostPointer = nullptr;
 };
+
+struct MultiTileModuleFixture : public MultiDeviceModuleFixture {
+    void SetUp() {
+        DebugManager.flags.EnableImplicitScaling.set(1);
+        MultiDeviceFixture::numRootDevices = 1u;
+        MultiDeviceFixture::numSubDevices = 2u;
+
+        MultiDeviceModuleFixture::SetUp();
+        createModuleFromBinary(0);
+
+        device = driverHandle->devices[0];
+    }
+
+    void TearDown() {
+        MultiDeviceModuleFixture::TearDown();
+    }
+
+    DebugManagerStateRestore debugRestore;
+    VariableBackup<bool> backup{&NEO::ImplicitScaling::apiSupport, true};
+    L0::Device *device = nullptr;
+};
 
 } // namespace ult
 } // namespace L0
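The fixture forces implicit scaling on regardless of the hardware under test: EnableImplicitScaling is a debug override (rolled back by DebugManagerStateRestore), and VariableBackup temporarily flips NEO::ImplicitScaling::apiSupport to true for the lifetime of the test. A rough sketch of that scoped-backup idiom (simplified and hypothetical, not the real shared-test helper):

    template <typename T>
    class ScopedBackupSketch {
      public:
        // Save the current value and install the override.
        ScopedBackupSketch(T *target, T newValue) : target(target), saved(*target) {
            *target = newValue;
        }
        // Restore the original value on scope exit (fixture teardown).
        ~ScopedBackupSketch() { *target = saved; }

      private:
        T *target;
        T saved;
    };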


@@ -2328,5 +2328,34 @@ TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplici
     EXPECT_EQ(0, memcmp(data, initData, 64));
 }
+
+using MultiTileModuleTest = Test<MultiTileModuleFixture>;
+HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfaceStateThenMultiTileFlagsAreSetCorrectly, IsXEHP) {
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+
+    ze_kernel_desc_t desc = {};
+    desc.pKernelName = kernelName.c_str();
+    WhiteBoxKernelHw<gfxCoreFamily> mockKernel;
+    mockKernel.module = modules[0].get();
+    mockKernel.initialize(&desc);
+
+    auto &arg = const_cast<NEO::ArgDescPointer &>(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as<NEO::ArgDescPointer>());
+    arg.bindless = undefined<CrossThreadDataOffset>;
+    arg.bindful = 0x40;
+
+    constexpr size_t size = 128;
+    uint64_t gpuAddress = 0x2000;
+    char bufferArray[size] = {};
+    void *buffer = reinterpret_cast<void *>(bufferArray);
+
+    NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size);
+    mockKernel.setBufferSurfaceState(0, buffer, &mockAllocation);
+
+    void *surfaceStateAddress = ptrOffset(mockKernel.surfaceStateHeapData.get(), arg.bindful);
+    RENDER_SURFACE_STATE *surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(surfaceStateAddress);
+
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuAtomics());
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuPartialWrites());
+}
 
 } // namespace ult
 } // namespace L0
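Taken together with the surface-state hunks above, this test pins down the observable contract of the commit: on a single-root-device, two-tile XeHP configuration with implicit scaling forced on, a buffer argument's RENDER_SURFACE_STATE must leave multi-GPU atomics and partial writes enabled, so both "disable" bits read back false.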