fix: add missing sba capture when debug enabled

- In global bindless mode, SBA may be programmed for the scratch allocation;
  the missing SBA capture caused issues for kernels that use scratch.
- This fix adds SBA capturing after the SBA command.

Related-To: NEO-7063

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2024-08-19 17:01:03 +00:00
committed by Compute-Runtime-Automation
parent 579af57161
commit 4c3a0d8344
8 changed files with 248 additions and 4 deletions

View File

@@ -8,6 +8,7 @@
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
#include "shared/source/helpers/file_io.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/indirect_heap/indirect_heap.h"
@@ -149,5 +150,125 @@ HWTEST2_F(DebuggerSingleAddressSpaceAub, GivenSingleAddressSpaceWhenCmdListIsExe
driverHandle->svmAllocsManager->freeSVMAlloc(bufferDst);
}
// AUB debugger fixture variant that turns on the external SSH/DSH allocator
// and bindless mode debug flags before running the base fixture set-up.
struct DebuggerSingleAddressSpaceGlobalBindlessAllocatorAubFixture : DebuggerAubFixture {
    void setUp() {
        // Flags have to be in place before the base fixture is initialised.
        auto &debugFlags = NEO::debugManager.flags;
        debugFlags.UseExternalAllocatorForSshAndDsh.set(1);
        debugFlags.UseBindlessMode.set(1);

        DebuggerAubFixture::setUp();
    }

    // Nothing extra to release here; defer to the base fixture.
    void tearDown() { DebuggerAubFixture::tearDown(); }
};
// Test alias for the global-bindless AUB fixture and the platform matcher used by the test below.
using DebuggerGlobalAllocatorAub = Test<DebuggerSingleAddressSpaceGlobalBindlessAllocatorAubFixture>;
using PlatformsSupportingGlobalBindless = IsWithinGfxCore<IGFX_XE_HP_CORE, IGFX_XE2_HPG_CORE>;
// AUB test: launches the "spill_test" kernel from the "simple_spill_fill_kernel" binary,
// verifies the destination buffer contents, and then verifies the values stored in the
// debugger's SBA tracking buffer (instruction, surface state, bindless surface state and,
// when a dynamic state heap exists, dynamic state base addresses).
HWTEST2_F(DebuggerGlobalAllocatorAub, GivenKernelWithScratchWhenCmdListExecutedThenSbaAddressesAreTracked, PlatformsSupportingGlobalBindless) {
const uint32_t arraySize = 32;
const uint32_t typeSize = sizeof(int);
// (32 * 2 + 1) * 4 - 4 = 256 bytes, i.e. room for 2 * arraySize ints.
uint32_t bufferSize = (arraySize * 2 + 1) * typeSize - 4;
const uint32_t groupSize[] = {arraySize, 1, 1};
const uint32_t groupCount[] = {1, 1, 1};
memoryManager = neoDevice->getMemoryManager();
gmmHelper = neoDevice->getGmmHelper();
rootDeviceIndex = neoDevice->getRootDeviceIndex();
NEO::debugManager.flags.UpdateCrossThreadDataSize.set(true);
// The rest of the test dereferences the bindless heaps helper, so it must exist.
ASSERT_NE(nullptr, neoDevice->getBindlessHeapsHelper());
NEO::SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory,
1,
context->rootDeviceIndices,
context->deviceBitfields);
// Three zero-initialised host unified-memory allocations: destination, source and offsets.
auto bufferDst = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
memset(bufferDst, 0, bufferSize);
auto bufferSrc = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
memset(bufferSrc, 0, bufferSize);
auto bufferOffset = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(128 * arraySize, unifiedMemoryProperties);
memset(bufferOffset, 0, 128 * arraySize);
int *srcBufferInt = static_cast<int *>(bufferSrc);
std::unique_ptr<int[]> expectedMemoryInt = std::make_unique<int[]>(bufferSize / typeSize);
const int expectedVal1 = 16256;
const int expectedVal2 = 512;
// Source gets the value 2 in its first arraySize ints; the expected output alternates
// expectedVal1 / expectedVal2 for each of the arraySize work items.
for (uint32_t i = 0; i < arraySize; ++i) {
srcBufferInt[i] = 2;
expectedMemoryInt[i * 2] = expectedVal1;
expectedMemoryInt[i * 2 + 1] = expectedVal2;
}
auto simulatedCsr = AUBFixtureL0::getSimulatedCsr<FamilyType>();
simulatedCsr->initializeEngine();
// Push the (zeroed) destination allocation to the simulated CSR before the kernel runs.
simulatedCsr->writeMemory(*driverHandle->svmAllocsManager->getSVMAlloc(bufferDst)->gpuAllocations.getDefaultGraphicsAllocation());
ze_group_count_t dispatchTraits;
dispatchTraits.groupCountX = groupCount[0];
dispatchTraits.groupCountY = groupCount[1];
dispatchTraits.groupCountZ = groupCount[2];
// Trailing `true` is the useSharedFile argument of createModuleFromFile.
module = static_cast<L0::ModuleImp *>(Module::fromHandle(createModuleFromFile("simple_spill_fill_kernel", context, device, "", true)));
ze_kernel_handle_t kernel;
ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC};
kernelDesc.pKernelName = "spill_test";
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelCreate(module->toHandle(), &kernelDesc, &kernel));
// Kernel arguments: 0 = source, 1 = destination, 2 = offset buffer.
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetArgumentValue(kernel, 0, sizeof(void *), &bufferSrc));
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetArgumentValue(kernel, 1, sizeof(void *), &bufferDst));
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetArgumentValue(kernel, 2, sizeof(void *), &bufferOffset));
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetGroupSize(kernel, groupSize[0], groupSize[1], groupSize[2]));
ze_command_list_handle_t cmdListHandle = commandList->toHandle();
EXPECT_EQ(ZE_RESULT_SUCCESS, zeCommandListAppendLaunchKernel(cmdListHandle, kernel, &dispatchTraits, nullptr, 0, nullptr));
commandList->close();
// Submit and wait with the maximum timeout before inspecting memory.
pCmdq->executeCommandLists(1, &cmdListHandle, nullptr, false, nullptr);
pCmdq->synchronize(std::numeric_limits<uint64_t>::max());
// Kernel output check: destination buffer must match the expected pattern.
expectMemory<FamilyType>(reinterpret_cast<void *>(driverHandle->svmAllocsManager->getSVMAlloc(bufferDst)->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()),
expectedMemoryInt.get(), bufferSize);
// GPU address of the SBA tracking buffer belonging to this CSR's OS context.
const auto sbaAddress = device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId())->getGpuAddress();
// Instruction base: canonized internal heap base address.
auto instructionHeapBaseAddress = memoryManager->getInternalHeapBaseAddress(rootDeviceIndex,
memoryManager->isLocalMemoryUsedForIsa(rootDeviceIndex));
instructionHeapBaseAddress = gmmHelper->canonize(instructionHeapBaseAddress);
expectMemory<FamilyType>(reinterpret_cast<void *>(sbaAddress + offsetof(NEO::SbaTrackedAddresses, instructionBaseAddress)),
&instructionHeapBaseAddress, sizeof(instructionHeapBaseAddress));
// Surface state base: canonized GPU address of the command list's surface state heap allocation.
auto commandListSurfaceHeapAllocation = commandList->commandContainer.getIndirectHeap(HeapType::surfaceState);
auto surfaceStateBaseAddress = commandListSurfaceHeapAllocation->getGraphicsAllocation()->getGpuAddress();
surfaceStateBaseAddress = gmmHelper->canonize(surfaceStateBaseAddress);
expectMemory<FamilyType>(reinterpret_cast<void *>(sbaAddress + offsetof(NEO::SbaTrackedAddresses, surfaceStateBaseAddress)),
&surfaceStateBaseAddress, sizeof(surfaceStateBaseAddress));
// Bindless surface state base: expected to equal the bindless heaps helper's global heaps base.
auto bindlessSurfaceStateBaseAddress = neoDevice->getBindlessHeapsHelper()->getGlobalHeapsBase();
expectMemory<FamilyType>(reinterpret_cast<void *>(sbaAddress + offsetof(NEO::SbaTrackedAddresses, bindlessSurfaceStateBaseAddress)),
&bindlessSurfaceStateBaseAddress, sizeof(bindlessSurfaceStateBaseAddress));
// Dynamic state base is only checked when the command list owns a dynamic state heap.
auto commandListDynamicHeapAllocation = commandList->commandContainer.getIndirectHeap(HeapType::dynamicState);
if (commandListDynamicHeapAllocation) {
auto dynamicStateBaseAddress = commandListDynamicHeapAllocation->getGraphicsAllocation()->getGpuAddress();
dynamicStateBaseAddress = gmmHelper->canonize(dynamicStateBaseAddress);
// NOTE(review): the canonized dynamicStateBaseAddress computed above is not what is
// compared here; the expected value passed in is bindlessSurfaceStateBaseAddress.
// Confirm whether this is intentional for global bindless mode or a copy-paste slip.
expectMemory<FamilyType>(reinterpret_cast<void *>(sbaAddress + offsetof(NEO::SbaTrackedAddresses, dynamicStateBaseAddress)),
&bindlessSurfaceStateBaseAddress, sizeof(bindlessSurfaceStateBaseAddress));
}
// Clean-up: destroy the kernel and release all three allocations.
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelDestroy(kernel));
driverHandle->svmAllocsManager->freeSVMAlloc(bufferDst);
driverHandle->svmAllocsManager->freeSVMAlloc(bufferSrc);
driverHandle->svmAllocsManager->freeSVMAlloc(bufferOffset);
}
} // namespace ult
} // namespace L0

View File

@@ -105,10 +105,14 @@ void AUBFixtureL0::tearDown() {
pCmdq->destroy();
}
ze_module_handle_t AUBFixtureL0::createModuleFromFile(const std::string &fileName, ze_context_handle_t context, ze_device_handle_t device, const std::string &buildFlags) {
ze_module_handle_t AUBFixtureL0::createModuleFromFile(const std::string &fileName, ze_context_handle_t context, ze_device_handle_t device, const std::string &buildFlags, bool useSharedFile) {
ze_module_handle_t moduleHandle;
std::string testFile;
retrieveBinaryKernelFilenameApiSpecific(testFile, fileName + "_", ".bin");
if (useSharedFile) {
retrieveBinaryKernelFilename(testFile, fileName + "_", ".bin");
} else {
retrieveBinaryKernelFilenameApiSpecific(testFile, fileName + "_", ".bin");
}
size_t size = 0;
auto src = loadDataFromFile(testFile.c_str(), size);
@@ -129,4 +133,5 @@ ze_module_handle_t AUBFixtureL0::createModuleFromFile(const std::string &fileNam
EXPECT_EQ(ZE_RESULT_SUCCESS, zeModuleCreate(context, device, &moduleDesc, &moduleHandle, nullptr));
return moduleHandle;
}
} // namespace L0

View File

@@ -101,7 +101,7 @@ class AUBFixtureL0 {
}
}
static ze_module_handle_t createModuleFromFile(const std::string &fileName, ze_context_handle_t context, ze_device_handle_t device, const std::string &buildFlags);
static ze_module_handle_t createModuleFromFile(const std::string &fileName, ze_context_handle_t context, ze_device_handle_t device, const std::string &buildFlags, bool useSharedFile = false);
std::string aubFileName;
std::unique_ptr<VariableBackup<NEO::UltHwConfig>> backupUltConfig;

View File

@@ -10,5 +10,7 @@
namespace L0 {
namespace ult {
HWTEST_EXCLUDE_PRODUCT(DebuggerSingleAddressSpaceAub, GivenSingleAddressSpaceWhenCmdListIsExecutedThenSbaAddressesAreTracked_PlatformsSupportingSingleAddressSpace, IGFX_XE_HPG_CORE);
}
HWTEST_EXCLUDE_PRODUCT(DebuggerGlobalAllocatorAub, GivenKernelWithScratchWhenCmdListExecutedThenSbaAddressesAreTracked_PlatformsSupportingGlobalBindless, IGFX_XE_HPG_CORE);
} // namespace ult
} // namespace L0

View File

@@ -111,6 +111,19 @@ struct L0DebuggerPerContextAddressSpaceFixture : public L0DebuggerHwFixture {
DebugManagerStateRestore restorer;
};
// L0 debugger fixture that forces SBA tracking mode 0 and enables bindless mode
// together with the external SSH/DSH allocator before the base set-up runs.
struct L0DebuggerPerContextAddressSpaceGlobalBindlessFixture : L0DebuggerHwFixture {
    DebugManagerStateRestore restorer;

    void setUp() {
        // Debug flags must be configured ahead of the base fixture initialisation.
        auto &debugFlags = NEO::debugManager.flags;
        debugFlags.DebuggerForceSbaTrackingMode.set(0);
        debugFlags.UseBindlessMode.set(1);
        debugFlags.UseExternalAllocatorForSshAndDsh.set(1);

        L0DebuggerHwFixture::setUp();
    }

    // No fixture-specific resources; forward to the base tear-down.
    void tearDown() { L0DebuggerHwFixture::tearDown(); }
};
struct L0DebuggerSingleAddressSpaceFixture : public L0DebuggerHwFixture {
void setUp() {
NEO::debugManager.flags.DebuggerForceSbaTrackingMode.set(1);

View File

@@ -17,6 +17,8 @@
#include "level_zero/core/source/cmdlist/cmdlist.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
#include "level_zero/core/test/unit_tests/sources/debugger/l0_debugger_fixture.h"
namespace L0 {
@@ -179,6 +181,90 @@ HWTEST2_F(L0DebuggerPerContextAddressSpaceTest, givenDebuggingEnabledAndRequired
commandQueue->destroy();
}
// Test alias for the global-bindless per-context fixture and the platform matcher used below.
using L0DebuggerPerContextAddressSpaceGlobalBindlessTest = Test<L0DebuggerPerContextAddressSpaceGlobalBindlessFixture>;
using PlatformsSupportingGlobalBindless = IsWithinGfxCore<IGFX_XE_HP_CORE, IGFX_XE2_HPG_CORE>;
// Appends a kernel with a non-zero per-thread scratch size to a command list, executes it,
// and checks that the command list stream contains, after its last STATE_BASE_ADDRESS,
// an MI_STORE_DATA_IMM that writes the surface state base address into the
// surfaceStateBaseAddress slot of the SBA tracking buffer.
HWTEST2_F(L0DebuggerPerContextAddressSpaceGlobalBindlessTest, givenDebuggingEnabledAndRequiredSshWhenCommandListIsExecutedThenProgramSsbaWritesToSbaTrackingBuffer, PlatformsSupportingGlobalBindless) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
ze_command_queue_desc_t queueDesc = {};
ze_result_t returnValue;
auto cmdQ = CommandQueue::create(productFamily, device, neoDevice->getDefaultEngine().commandStreamReceiver, &queueDesc, false, false, false, returnValue);
ASSERT_NE(nullptr, cmdQ);
auto commandQueue = whiteboxCast(cmdQ);
// NOTE(review): this is read from the command queue's stream, while usedSpaceAfter below
// is read from the command list's stream — confirm the cross-stream comparison is intended.
auto usedSpaceBefore = commandQueue->commandStream.getUsed();
auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
ze_command_list_handle_t commandLists[] = {commandList->toHandle()};
Mock<Module> module(device, nullptr, ModuleType::user);
Mock<KernelImp> kernel;
kernel.module = &module;
ze_group_count_t groupCount{1, 1, 1};
// Non-zero per-thread scratch size on the mock kernel descriptor.
kernel.descriptor.kernelAttributes.perThreadScratchSize[0] = 0x40;
CmdListKernelLaunchParams launchParams = {};
auto result = commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
CommandList::fromHandle(commandLists[0])->close();
uint32_t numCommandLists = sizeof(commandLists) / sizeof(commandLists[0]);
result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true, nullptr);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto usedSpaceAfter = commandList->getCmdContainer().getCommandStream()->getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
// Parse the command list's stream from its CPU base (offset 0) up to the used size.
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList, ptrOffset(commandList->getCmdContainer().getCommandStream()->getCpuBase(), 0), usedSpaceAfter));
// Take the last STATE_BASE_ADDRESS command found in the stream.
auto sbaItors = findAll<STATE_BASE_ADDRESS *>(cmdList.begin(), cmdList.end());
ASSERT_NE(0u, sbaItors.size());
auto sbaItor = sbaItors[sbaItors.size() - 1];
ASSERT_NE(cmdList.end(), sbaItor);
auto cmdSba = genCmdCast<STATE_BASE_ADDRESS *>(*sbaItor);
// Only MI_STORE_DATA_IMM commands located from that SBA onwards are considered.
auto sdiItors = findAll<MI_STORE_DATA_IMM *>(sbaItor, cmdList.end());
ASSERT_NE(0u, sdiItors.size());
auto cmdSdi = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItors[0]);
auto gmmHelper = neoDevice->getGmmHelper();
auto expectedSshGpuVa = commandList->getCmdContainer().getIndirectHeap(HeapType::surfaceState)->getGpuBase();
// Find the store whose 64-bit payload (dword1:dword0) equals the SSH GPU base;
// cmdSdi is left null when no store matches.
for (size_t i = 0; i < sdiItors.size(); i++) {
cmdSdi = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItors[i]);
uint64_t address = cmdSdi->getDataDword1();
address <<= 32;
address = address | cmdSdi->getDataDword0();
if (expectedSshGpuVa == address) {
break;
}
cmdSdi = nullptr;
}
ASSERT_NE(nullptr, cmdSdi);
// The stored payload must equal the canonized surface state base address programmed in the SBA
// (low 32 bits in dword0, high 32 bits in dword1).
uint64_t ssbaGpuVa = gmmHelper->canonize(cmdSba->getSurfaceStateBaseAddress());
EXPECT_EQ(static_cast<uint32_t>(ssbaGpuVa & 0x0000FFFFFFFFULL), cmdSdi->getDataDword0());
EXPECT_EQ(static_cast<uint32_t>(ssbaGpuVa >> 32), cmdSdi->getDataDword1());
// The store must target the surfaceStateBaseAddress field of the decanonized SBA tracking GPU VA.
auto expectedGpuVa = gmmHelper->decanonize(device->getL0Debugger()->getSbaTrackingGpuVa()) + offsetof(NEO::SbaTrackedAddresses, surfaceStateBaseAddress);
EXPECT_EQ(expectedGpuVa, cmdSdi->getAddress());
// Clean-up: destroy every command list, then the queue.
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(commandLists[i]);
commandList->destroy();
}
commandQueue->destroy();
}
HWTEST2_F(L0DebuggerTest, givenDebuggingEnabledAndDebuggerLogsWhenCommandQueueIsSynchronizedThenSbaAddressesArePrinted, Gen12Plus) {
auto &compilerProductHelper = neoDevice->getCompilerProductHelper();