diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 5ae0f31b91..5ba7afe2bd 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -905,7 +905,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { uint32_t bvhLevels = NEO::RayTracingHelper::maxBvhLevels; auto arg = this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.rtDispatchGlobals; if (arg.pointerSize == 0) { - // application is allocating its own RTDispatchGlobals manually + // kernel is allocating its own RTDispatchGlobals manually neoDevice->initializeRayTracing(0); } else { neoDevice->initializeRayTracing(bvhLevels); @@ -914,18 +914,17 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { return ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - for (auto rtStack : rtDispatchGlobalsInfo->rtStacks) { - this->residencyContainer.push_back(rtStack); + for (auto rtDispatchGlobals : rtDispatchGlobalsInfo->rtDispatchGlobals) { + this->residencyContainer.push_back(rtDispatchGlobals); } - auto address = rtDispatchGlobalsInfo->rtDispatchGlobalsArray->getGpuAddressToPatch(); + auto address = rtDispatchGlobalsInfo->rtDispatchGlobals[0]->getGpuAddressToPatch(); NEO::patchPointer(ArrayRef(crossThreadData.get(), crossThreadDataSize), arg, static_cast(address)); - this->residencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray); + this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer()); } - this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer()); } return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 46b496107e..6dfba03af9 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -897,7 +897,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndNoRTDispatchGlobalsIs neoDevice->executionEnvironment->memoryManager.swap(otherMemoryManager); } -HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTStackAllocationFailsThenRayTracingIsNotInitialized, IsAtLeastXeHpgCore) { +HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTDispatchGlobalsArrayAllocationFailsThenRayTracingIsNotInitialized, IsAtLeastXeHpgCore) { KernelDescriptor mockDescriptor = {}; mockDescriptor.kernelAttributes.flags.hasRTCalls = true; mockDescriptor.kernelMetadata.kernelName = "rt_test"; @@ -931,7 +931,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTStackAllocationFail neoDevice->rtDispatchGlobalsForceAllocation = false; std::unique_ptr otherMemoryManager; - // Ensure that allocating RTDispatchGlobals succeeds, but first RTStack allocation fails. + // Ensure that allocating RTDispatchGlobals succeeds, but the array allocation fails. otherMemoryManager = std::make_unique(1, *neoDevice->executionEnvironment); neoDevice->executionEnvironment->memoryManager.swap(otherMemoryManager); @@ -1059,7 +1059,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenCrossThreadDataIsPatche EXPECT_NE(nullptr, rtDispatchGlobals); auto dispatchGlobalsAddressPatched = *reinterpret_cast(ptrOffset(crossThreadData.get(), rtGlobalPointerPatchOffset)); - auto dispatchGlobalsGpuAddressOffset = static_cast(rtDispatchGlobals->rtDispatchGlobalsArray->getGpuAddressToPatch()); + auto dispatchGlobalsGpuAddressOffset = static_cast(rtDispatchGlobals->rtDispatchGlobals[0]->getGpuAddressToPatch()); EXPECT_EQ(dispatchGlobalsGpuAddressOffset, dispatchGlobalsAddressPatched); kernel->crossThreadData.release(); diff --git a/opencl/test/unit_test/helpers/ray_tracing_helper_tests.cpp b/opencl/test/unit_test/helpers/ray_tracing_helper_tests.cpp index 01d4f51529..051b79ecf7 100644 --- a/opencl/test/unit_test/helpers/ray_tracing_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/ray_tracing_helper_tests.cpp @@ -33,22 +33,17 @@ TEST(RayTracingHelperTests, whenMemoryBackedFifoSizeIsRequestedThenCorrectValueI } TEST(RayTracingHelperTests, whenGlobalDispatchSizeIsRequestedThenCorrectValueIsReturned) { - size_t expectedSize = alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize); - size_t size = RayTracingHelper::getDispatchGlobalSize(); - EXPECT_EQ(expectedSize, size); -} - -TEST(RayTracingHelperTests, whenRTStackSizeIsRequestedThenCorrectValueIsReturned) { MockClDevice device{new MockDevice}; MockContext context(&device); uint32_t maxBvhLevel = 2; uint32_t extraBytesLocal = 20; uint32_t extraBytesGlobal = 100; - uint32_t tiles = 2; - size_t expectedSize = RayTracingHelper::getStackSizePerRay(maxBvhLevel, extraBytesLocal) * (RayTracingHelper::getNumRtStacks(device.getDevice()) / tiles) + extraBytesGlobal; - size_t size = RayTracingHelper::getRTStackSizePerTile(device.getDevice(), tiles, maxBvhLevel, extraBytesLocal, extraBytesGlobal); + size_t expectedSize = alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize) + + (RayTracingHelper::hitInfoSize + RayTracingHelper::bvhStackSize * maxBvhLevel + extraBytesLocal) * RayTracingHelper::getNumRtStacks(device.getDevice()) + + extraBytesGlobal; + size_t size = RayTracingHelper::getDispatchGlobalSize(device.getDevice(), maxBvhLevel, extraBytesLocal, extraBytesGlobal); EXPECT_EQ(expectedSize, size); } diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp index 5c006ad4e2..ac1f09ebd5 100644 --- a/shared/source/device/device.cpp +++ b/shared/source/device/device.cpp @@ -663,13 +663,13 @@ void Device::finalizeRayTracing() { if (rtDispatchGlobalsInfo == nullptr) { continue; } - for (size_t j = 0; j < rtDispatchGlobalsInfo->rtStacks.size(); j++) { - getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtStacks[j]); - rtDispatchGlobalsInfo->rtStacks[j] = nullptr; + for (size_t j = 0; j < rtDispatchGlobalsInfo->rtDispatchGlobals.size(); j++) { + getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtDispatchGlobals[j]); + rtDispatchGlobalsInfo->rtDispatchGlobals[j] = nullptr; } - getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtDispatchGlobalsArray); - rtDispatchGlobalsInfo->rtDispatchGlobalsArray = nullptr; + getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation); + rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation = nullptr; delete rtDispatchGlobalsInfos[i]; rtDispatchGlobalsInfos[i] = nullptr; @@ -749,16 +749,11 @@ void Device::allocateRTDispatchGlobals(uint32_t maxBvhLevels) { uint32_t extraBytesLocal = 0; uint32_t extraBytesGlobal = 0; - uint32_t dispatchGlobalsStride = MemoryConstants::pageSize64k; - UNRECOVERABLE_IF(RayTracingHelper::getDispatchGlobalSize() > dispatchGlobalsStride); - - bool allocFailed = false; + auto size = RayTracingHelper::getDispatchGlobalSize(*this, maxBvhLevels, extraBytesLocal, extraBytesGlobal); const auto deviceCount = HwHelper::getSubDevicesCount(executionEnvironment->rootDeviceEnvironments[getRootDeviceIndex()]->getHardwareInfo()); - auto dispatchGlobalsSize = deviceCount * dispatchGlobalsStride; - auto rtStackSize = RayTracingHelper::getRTStackSizePerTile(*this, deviceCount, maxBvhLevels, extraBytesLocal, extraBytesGlobal); - std::unique_ptr dispatchGlobalsInfo = std::make_unique(); + auto dispatchGlobalsInfo = new RTDispatchGlobalsInfo(nullptr); if (dispatchGlobalsInfo == nullptr) { return; } @@ -766,38 +761,25 @@ void Device::allocateRTDispatchGlobals(uint32_t maxBvhLevels) { auto &hwInfo = getHardwareInfo(); auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily); - GraphicsAllocation *dispatchGlobalsArrayAllocation = nullptr; - - AllocationProperties arrayAllocProps(getRootDeviceIndex(), true, dispatchGlobalsSize, - AllocationType::BUFFER, true, getDeviceBitfield()); - arrayAllocProps.flags.resource48Bit = true; - arrayAllocProps.flags.isUSMDeviceAllocation = true; - dispatchGlobalsArrayAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(arrayAllocProps); - - if (dispatchGlobalsArrayAllocation == nullptr) { - return; - } + std::vector gpuAddressVector; + bool allocFailed = false; for (unsigned int tile = 0; tile < deviceCount; tile++) { - DeviceBitfield deviceBitfield = - (deviceCount == 1) - ? this->getDeviceBitfield() - : subdevices[tile]->getDeviceBitfield(); - - AllocationProperties allocProps(getRootDeviceIndex(), true, rtStackSize, AllocationType::BUFFER, true, deviceBitfield); + AllocationProperties allocProps(getRootDeviceIndex(), true, size, AllocationType::BUFFER, true, getDeviceBitfield()); allocProps.flags.resource48Bit = true; allocProps.flags.isUSMDeviceAllocation = true; - auto rtStackAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(allocProps); + auto dispatchGlobalsAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(allocProps); - if (rtStackAllocation == nullptr) { + if (dispatchGlobalsAllocation == nullptr) { allocFailed = true; break; } + auto dispatchGlobalsPtr = dispatchGlobalsAllocation->getGpuAddress(); struct RTDispatchGlobals dispatchGlobals = {0}; - dispatchGlobals.rtMemBasePtr = rtStackAllocation->getGpuAddress(); + dispatchGlobals.rtMemBasePtr = size + dispatchGlobalsPtr; dispatchGlobals.callStackHandlerKSP = reinterpret_cast(nullptr); dispatchGlobals.stackSizePerRay = 0; dispatchGlobals.numDSSRTStacks = RayTracingHelper::stackDssMultiplier; @@ -806,27 +788,45 @@ void Device::allocateRTDispatchGlobals(uint32_t maxBvhLevels) { uint32_t *dispatchGlobalsAsArray = reinterpret_cast(&dispatchGlobals); dispatchGlobalsAsArray[7] = 1; - MemoryTransferHelper::transferMemoryToAllocation(hwInfoConfig.isBlitCopyRequiredForLocalMemory(this->getHardwareInfo(), *dispatchGlobalsArrayAllocation), + MemoryTransferHelper::transferMemoryToAllocation(hwInfoConfig.isBlitCopyRequiredForLocalMemory(this->getHardwareInfo(), *dispatchGlobalsAllocation), *this, - dispatchGlobalsArrayAllocation, - tile * dispatchGlobalsStride, + dispatchGlobalsAllocation, + 0, &dispatchGlobals, sizeof(RTDispatchGlobals)); - dispatchGlobalsInfo->rtStacks.push_back(rtStackAllocation); + dispatchGlobalsInfo->rtDispatchGlobals.push_back(dispatchGlobalsAllocation); + gpuAddressVector.push_back(dispatchGlobalsAllocation->getGpuAddress()); } - if (allocFailed) { - for (auto allocation : dispatchGlobalsInfo->rtStacks) { + GraphicsAllocation *dispatchGlobalsArrayAllocation = nullptr; + size_t arrayAllocSize = sizeof(uint64_t) * deviceCount; + + if (!allocFailed) { + AllocationProperties arrayAllocProps(getRootDeviceIndex(), true, arrayAllocSize, + AllocationType::BUFFER, true, getDeviceBitfield()); + arrayAllocProps.flags.resource48Bit = true; + arrayAllocProps.flags.isUSMDeviceAllocation = true; + dispatchGlobalsArrayAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(arrayAllocProps); + } + + if (dispatchGlobalsArrayAllocation == nullptr) { + for (auto allocation : dispatchGlobalsInfo->rtDispatchGlobals) { getMemoryManager()->freeGraphicsMemory(allocation); } - - getMemoryManager()->freeGraphicsMemory(dispatchGlobalsArrayAllocation); + delete dispatchGlobalsInfo; return; } - dispatchGlobalsInfo->rtDispatchGlobalsArray = dispatchGlobalsArrayAllocation; - rtDispatchGlobalsInfos[maxBvhLevels] = dispatchGlobalsInfo.release(); + MemoryTransferHelper::transferMemoryToAllocation(hwInfoConfig.isBlitCopyRequiredForLocalMemory(this->getHardwareInfo(), *dispatchGlobalsArrayAllocation), + *this, + dispatchGlobalsArrayAllocation, + 0, + gpuAddressVector.data(), + arrayAllocSize); + + dispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation = dispatchGlobalsArrayAllocation; + rtDispatchGlobalsInfos[maxBvhLevels] = dispatchGlobalsInfo; } } // namespace NEO diff --git a/shared/source/device/device.h b/shared/source/device/device.h index 7ae2f5e03e..efc5d76ba4 100644 --- a/shared/source/device/device.h +++ b/shared/source/device/device.h @@ -41,8 +41,10 @@ struct EngineGroupT { using EngineGroupsT = std::vector; struct RTDispatchGlobalsInfo { - GraphicsAllocation *rtDispatchGlobalsArray = nullptr; - std::vector rtStacks; // per tile + RTDispatchGlobalsInfo(GraphicsAllocation *rtDispatchGlobalsArrayAllocation) + : rtDispatchGlobalsArrayAllocation(rtDispatchGlobalsArrayAllocation){}; + std::vector rtDispatchGlobals; // per tile + GraphicsAllocation *rtDispatchGlobalsArrayAllocation; // above array as visible from device }; class Device : public ReferenceTrackedObject { diff --git a/shared/source/helpers/ray_tracing_helper.h b/shared/source/helpers/ray_tracing_helper.h index 7aa288c403..65b6f65d04 100644 --- a/shared/source/helpers/ray_tracing_helper.h +++ b/shared/source/helpers/ray_tracing_helper.h @@ -24,12 +24,10 @@ class RayTracingHelper : public NonCopyableOrMovableClass { static constexpr uint32_t memoryBackedFifoSizePerDss = 8 * KB; static constexpr uint32_t maxBvhLevels = 8; - static size_t getDispatchGlobalSize() { - return static_cast(alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize)); - } - - static size_t getRTStackSizePerTile(const Device &device, uint32_t tiles, uint32_t maxBvhLevel, uint32_t extraBytesLocal, uint32_t extraBytesGlobal) { - return static_cast(getStackSizePerRay(maxBvhLevel, extraBytesLocal) * (getNumRtStacks(device) / tiles) + extraBytesGlobal); + static size_t getDispatchGlobalSize(const Device &device, uint32_t maxBvhLevel, uint32_t extraBytesLocal, uint32_t extraBytesGlobal) { + return static_cast(alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize) + + getStackSizePerRay(maxBvhLevel, extraBytesLocal) * getNumRtStacks(device) + + extraBytesGlobal); } static size_t getTotalMemoryBackedFifoSize(const Device &device) { diff --git a/shared/test/common/mocks/mock_device.h b/shared/test/common/mocks/mock_device.h index 8f20984aae..3d55e85fa3 100644 --- a/shared/test/common/mocks/mock_device.h +++ b/shared/test/common/mocks/mock_device.h @@ -168,12 +168,12 @@ class MockDevice : public RootDevice { for (unsigned int i = 0; i < rtDispatchGlobalsInfos.size(); i++) { auto rtDispatchGlobalsInfo = rtDispatchGlobalsInfos[i]; if (rtDispatchGlobalsForceAllocation == true && rtDispatchGlobalsInfo != nullptr) { - for (unsigned int j = 0; j < rtDispatchGlobalsInfo->rtStacks.size(); j++) { - delete rtDispatchGlobalsInfo->rtStacks[j]; - rtDispatchGlobalsInfo->rtStacks[j] = nullptr; + for (unsigned int j = 0; j < rtDispatchGlobalsInfo->rtDispatchGlobals.size(); j++) { + delete rtDispatchGlobalsInfo->rtDispatchGlobals[j]; + rtDispatchGlobalsInfo->rtDispatchGlobals[j] = nullptr; } - delete rtDispatchGlobalsInfo->rtDispatchGlobalsArray; - rtDispatchGlobalsInfo->rtDispatchGlobalsArray = nullptr; + delete rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation; + rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation = nullptr; delete rtDispatchGlobalsInfos[i]; rtDispatchGlobalsInfos[i] = nullptr; } diff --git a/shared/test/unit_test/device/neo_device_tests.cpp b/shared/test/unit_test/device/neo_device_tests.cpp index 641cd2b3dc..89abbf5f2c 100644 --- a/shared/test/unit_test/device/neo_device_tests.cpp +++ b/shared/test/unit_test/device/neo_device_tests.cpp @@ -98,26 +98,6 @@ TEST_F(DeviceTest, whenAllocateRTDispatchGlobalsIsCalledThenRTDispatchGlobalsIsA EXPECT_NE(nullptr, pDevice->getRTDispatchGlobals(3)); } -HWTEST2_F(DeviceTest, whenAllocateRTDispatchGlobalsIsCalledAndRTStackAllocationFailsRTDispatchGlobalsIsNotAllocated, IsPVC) { - DebugManagerStateRestore dbgRestorer; - - DebugManager.flags.CreateMultipleSubDevices.set(2); - pDevice->deviceBitfield = 3; - - pDevice->subdevices.push_back(new SubDevice(pDevice->executionEnvironment, 0, *pDevice)); - pDevice->subdevices.push_back(new SubDevice(pDevice->executionEnvironment, 1, *pDevice)); - - std::unique_ptr otherMemoryManager; - otherMemoryManager = std::make_unique(*pDevice->executionEnvironment); - static_cast(*otherMemoryManager).capacity = 25000000; - pDevice->executionEnvironment->memoryManager.swap(otherMemoryManager); - - pDevice->initializeRayTracing(5); - EXPECT_EQ(nullptr, pDevice->getRTDispatchGlobals(3)); - - pDevice->executionEnvironment->memoryManager.swap(otherMemoryManager); -} - TEST_F(DeviceTest, givenDispatchGlobalsAllocationFailsThenRTDispatchGlobalsInfoIsNull) { std::unique_ptr otherMemoryManager; otherMemoryManager = std::make_unique(1, *pDevice->getExecutionEnvironment()); @@ -644,4 +624,4 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DeviceTests, givenNonDebuggableOsContextWhenDeviceC auto device = deviceFactory.rootDevices[0]; auto csr = device->allEngines[device->defaultEngineIndex].commandStreamReceiver; EXPECT_EQ(0u, csr->peekLatestSentTaskCount()); -} +} \ No newline at end of file