diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp index 92b9c9d7d8..40cc59de2c 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp @@ -10,7 +10,6 @@ #include "shared/source/helpers/register_offsets.h" #include "shared/test/unit_test/cmd_parse/gen_cmd_parse.h" -#include "opencl/source/helpers/hardware_commands_helper.h" #include "test.h" #include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h" @@ -113,7 +112,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenFunctionWhenBind auto dsh = commandList->commandContainer.getIndirectHeap(NEO::HeapType::DYNAMIC_STATE); auto idd = static_cast(ptrOffset(dsh->getCpuBase(), cmd->getInterfaceDescriptorDataStartAddress())); - if (NEO::HardwareCommandsHelper::doBindingTablePrefetch()) { + if (NEO::EncodeSurfaceState::doBindingTablePrefetch()) { uint32_t numArgs = kernel->kernelImmData->getDescriptor().payloadMappings.bindingTable.numEntries; EXPECT_EQ(numArgs, idd->getBindingTableEntryCount()); } else { diff --git a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl index 6ce83a723f..3379f7ea75 100644 --- a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl +++ b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl @@ -186,11 +186,11 @@ void DeviceQueueHw::setupIndirectState(IndirectHeap &surfaceStateHeap totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); surfaceStateHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); - auto btOffset = HardwareCommandsHelper::pushBindingTableAndSurfaceStates(surfaceStateHeap, bindingTableCount, - pBlockInfo->heapInfo.pSsh, - 
pBlockInfo->heapInfo.SurfaceStateHeapSize, - bindingTableCount, - pBlockInfo->patchInfo.bindingTableState->Offset); + auto btOffset = EncodeSurfaceState::pushBindingTableAndSurfaceStates(surfaceStateHeap, bindingTableCount, + pBlockInfo->heapInfo.pSsh, + pBlockInfo->heapInfo.SurfaceStateHeapSize, + bindingTableCount, + pBlockInfo->patchInfo.bindingTableState->Offset); parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast(btOffset)); diff --git a/opencl/source/gen11/hardware_commands_helper_gen11.cpp b/opencl/source/gen11/hardware_commands_helper_gen11.cpp index 04c5401b84..5fa379c526 100644 --- a/opencl/source/gen11/hardware_commands_helper_gen11.cpp +++ b/opencl/source/gen11/hardware_commands_helper_gen11.cpp @@ -14,10 +14,5 @@ namespace NEO { -template <> -bool HardwareCommandsHelper::doBindingTablePrefetch() { - return false; -} - template struct HardwareCommandsHelper; } // namespace NEO diff --git a/opencl/source/gen12lp/hardware_commands_helper_gen12lp.cpp b/opencl/source/gen12lp/hardware_commands_helper_gen12lp.cpp index 5d2ba7448b..d94571d94a 100644 --- a/opencl/source/gen12lp/hardware_commands_helper_gen12lp.cpp +++ b/opencl/source/gen12lp/hardware_commands_helper_gen12lp.cpp @@ -23,10 +23,5 @@ size_t HardwareCommandsHelper::getSizeRequiredCS(const Kernel *kern return size; } -template <> -bool HardwareCommandsHelper::doBindingTablePrefetch() { - return false; -} - template struct HardwareCommandsHelper; } // namespace NEO diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h index 7bcb3aa7c3..b04546ef7e 100644 --- a/opencl/source/helpers/hardware_commands_helper.h +++ b/opencl/source/helpers/hardware_commands_helper.h @@ -7,7 +7,6 @@ #pragma once #include "shared/source/built_ins/built_ins.h" -#include "shared/source/indirect_heap/indirect_heap.h" #include "opencl/source/helpers/per_thread_data.h" #include "opencl/source/kernel/kernel.h" @@ -78,10 +77,6 @@ struct HardwareCommandsHelper : 
public PerThreadDataHelper { WALKER_TYPE *walkerCmd, uint32_t &sizeCrossThreadData); - static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, size_t bindingTableCount, - const void *srcKernelSsh, size_t srcKernelSshSize, - size_t numberOfBindingTableStates, size_t offsetOfBindingTable); - static size_t sendIndirectState( LinearStream &commandStream, IndirectHeap &dsh, @@ -143,8 +138,6 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { static void programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress); - static bool doBindingTablePrefetch(); - static bool inlineDataProgrammingRequired(const Kernel &kernel); static bool kernelUsesLocalIds(const Kernel &kernel); }; diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 08da487429..8405c1e7c6 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -198,57 +198,6 @@ size_t HardwareCommandsHelper::sendInterfaceDescriptorData( return (size_t)offsetInterfaceDescriptor; } -// Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base addess) -// as required by the INTERFACE_DESCRIPTOR_DATA. 
-template -size_t HardwareCommandsHelper::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, size_t bindingTableCount, - const void *srcKernelSsh, size_t srcKernelSshSize, - size_t numberOfBindingTableStates, size_t offsetOfBindingTable) { - using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; - using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; - using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; - - if (bindingTableCount == 0) { - // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch - return 0; - } - size_t sshSize = srcKernelSshSize; - DEBUG_BREAK_IF(srcKernelSsh == nullptr); - - auto srcSurfaceState = srcKernelSsh; - // Allocate space for new ssh data - auto dstSurfaceState = dstHeap.getSpace(sshSize); - - // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH. - // We may need to patch these pointers so that they are relative to surface state base address - if (dstSurfaceState == dstHeap.getCpuBase()) { - // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address) - // we need to simply copy the ssh (including BTIs from compiler) - memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize); - return offsetOfBindingTable; - } - - // We can copy-over the surface states, but BTIs will need to be patched - memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable); - - uint32_t surfaceStatesOffset = static_cast(ptrDiff(dstSurfaceState, dstHeap.getCpuBase())); - - // march over BTIs and offset the pointers based on surface state base address - auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, offsetOfBindingTable)); - DEBUG_BREAK_IF(reinterpret_cast(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0); - auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, 
offsetOfBindingTable)); - BINDING_TABLE_STATE bti = GfxFamily::cmdInitBindingTableState; - for (uint32_t i = 0, e = (uint32_t)numberOfBindingTableStates; i != e; ++i) { - uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer(); - uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset; - bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits - dstBtiTableBase[i] = bti; - DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0); - } - - return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase()); -} - template size_t HardwareCommandsHelper::sendIndirectState( LinearStream &commandStream, @@ -278,9 +227,9 @@ size_t HardwareCommandsHelper::sendIndirectState( ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); kernel.patchBindlessSurfaceStateOffsets(ssh.getUsed()); - auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0, - kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(), - kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset()); + auto dstBindingTablePointer = EncodeSurfaceState::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? 
kernelInfo.patchInfo.bindingTableState->Count : 0, + kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(), + kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset()); // Copy our sampler state if it exists uint32_t samplerStateOffset = 0; @@ -378,11 +327,6 @@ void HardwareCommandsHelper::updatePerThreadDataTotal( DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group } -template -bool HardwareCommandsHelper::doBindingTablePrefetch() { - return true; -} - template bool HardwareCommandsHelper::inlineDataProgrammingRequired(const Kernel &kernel) { auto checkKernelForInlineData = true; diff --git a/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl b/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl index d41134ff4b..5cdc541c09 100644 --- a/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl +++ b/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl @@ -139,7 +139,7 @@ size_t HardwareCommandsHelper::sendCrossThreadData( template bool HardwareCommandsHelper::resetBindingTablePrefetch(Kernel &kernel) { - return kernel.isSchedulerKernel || !doBindingTablePrefetch(); + return kernel.isSchedulerKernel || !EncodeSurfaceState::doBindingTablePrefetch(); } template diff --git a/opencl/test/unit_test/gen11/kernel_tests_gen11.cpp b/opencl/test/unit_test/gen11/kernel_tests_gen11.cpp index e5d81e3acf..33b7cfbfb9 100644 --- a/opencl/test/unit_test/gen11/kernel_tests_gen11.cpp +++ b/opencl/test/unit_test/gen11/kernel_tests_gen11.cpp @@ -18,8 +18,3 @@ GEN11TEST_F(Gen11KernelTest, givenKernelWhenCanTransformImagesIsCalledThenReturn auto retVal = mockKernel.mockKernel->Kernel::canTransformImages(); EXPECT_TRUE(retVal); } - -using Gen11HardwareCommandsTest = testing::Test; -GEN11TEST_F(Gen11HardwareCommandsTest, givenGen11PlatformWhenDoBindingTablePrefetchIsCalledThenReturnsFalse) { - EXPECT_FALSE(HardwareCommandsHelper::doBindingTablePrefetch()); -} diff 
--git a/opencl/test/unit_test/gen12lp/kernel_tests_gen12lp.inl b/opencl/test/unit_test/gen12lp/kernel_tests_gen12lp.inl index 1009606672..5a45aaf7f9 100644 --- a/opencl/test/unit_test/gen12lp/kernel_tests_gen12lp.inl +++ b/opencl/test/unit_test/gen12lp/kernel_tests_gen12lp.inl @@ -18,7 +18,3 @@ GEN12LPTEST_F(Gen12LpKernelTest, givenKernelWhenCanTransformImagesIsCalledThenRe auto retVal = mockKernel.mockKernel->Kernel::canTransformImages(); EXPECT_FALSE(retVal); } -using Gen12LpHardwareCommandsTest = testing::Test; -GEN12LPTEST_F(Gen12LpHardwareCommandsTest, givenGen12LpPlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) { - EXPECT_FALSE(HardwareCommandsHelper::doBindingTablePrefetch()); -} diff --git a/opencl/test/unit_test/gen8/kernel_tests_gen8.cpp b/opencl/test/unit_test/gen8/kernel_tests_gen8.cpp index 2a3e10fa0f..d00983f236 100644 --- a/opencl/test/unit_test/gen8/kernel_tests_gen8.cpp +++ b/opencl/test/unit_test/gen8/kernel_tests_gen8.cpp @@ -18,7 +18,3 @@ GEN8TEST_F(Gen8KernelTest, givenKernelWhenCanTransformImagesIsCalledThenReturnsF auto retVal = mockKernel.mockKernel->Kernel::canTransformImages(); EXPECT_FALSE(retVal); } -using Gen8HardwareCommandsTest = testing::Test; -GEN8TEST_F(Gen8HardwareCommandsTest, givenGen8PlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) { - EXPECT_TRUE(HardwareCommandsHelper::doBindingTablePrefetch()); -} diff --git a/opencl/test/unit_test/gen9/kernel_tests_gen9.cpp b/opencl/test/unit_test/gen9/kernel_tests_gen9.cpp index af3ef6c311..507a39b45f 100644 --- a/opencl/test/unit_test/gen9/kernel_tests_gen9.cpp +++ b/opencl/test/unit_test/gen9/kernel_tests_gen9.cpp @@ -19,7 +19,3 @@ GEN9TEST_F(Gen9KernelTest, givenKernelWhenCanTransformImagesIsCalledThenReturnsT auto retVal = mockKernel.mockKernel->Kernel::canTransformImages(); EXPECT_TRUE(retVal); } -using Gen9HardwareCommandsTest = testing::Test; -GEN9TEST_F(Gen9HardwareCommandsTest, givenGen9PlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) { - 
EXPECT_TRUE(HardwareCommandsHelper::doBindingTablePrefetch()); -} diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index 08ebebf7a5..cc1aeefb8a 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -405,7 +405,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl true); auto interfaceDescriptor = reinterpret_cast(dsh.getCpuBase()); - if (HardwareCommandsHelper::doBindingTablePrefetch()) { + if (EncodeSurfaceState::doBindingTablePrefetch()) { EXPECT_EQ(expectedBindingTableCount, interfaceDescriptor->getBindingTableEntryCount()); } else { EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount()); @@ -493,7 +493,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable true); auto interfaceDescriptor = reinterpret_cast(dsh.getCpuBase()); - if (HardwareCommandsHelper::doBindingTablePrefetch()) { + if (EncodeSurfaceState::doBindingTablePrefetch()) { EXPECT_EQ(31u, interfaceDescriptor->getBindingTableEntryCount()); } else { EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount()); @@ -833,7 +833,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh } HWTEST_F(HardwareCommandsTest, GivenBuffersNotRequiringSshWhenSettingBindingTableStatesForKernelThenSshIsNotUsed) { - // define kernel info auto pKernelInfo = std::make_unique(); @@ -889,7 +888,6 @@ HWTEST_F(HardwareCommandsTest, GivenBuffersNotRequiringSshWhenSettingBindingTabl } HWTEST_F(HardwareCommandsTest, GivenZeroSurfaceStatesWhenSettingBindingTableStatesThenPointerIsZero) { - // define kernel info auto pKernelInfo = std::make_unique(); diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h index d8aff02a38..376394de96 100644 --- 
a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h @@ -6,6 +6,7 @@ */ #include "shared/source/built_ins/built_ins.h" +#include "shared/source/command_container/command_encoder.h" #include "opencl/source/helpers/hardware_commands_helper.h" #include "opencl/source/kernel/kernel.h" @@ -44,8 +45,8 @@ struct HardwareCommandsTest : ClDeviceFixture, template size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) { - return HardwareCommandsHelper::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0, - srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(), - srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset()); + return EncodeSurfaceState::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? 
srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0, + srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(), + srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset()); } }; diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index f0d439c436..df23dda7a2 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -19,6 +19,7 @@ namespace NEO { class GmmHelper; +class IndirectHeap; template struct EncodeDispatchKernel { @@ -207,6 +208,11 @@ struct EncodeSurfaceState { static constexpr uintptr_t getSurfaceBaseAddressAlignment() { return 4; } static void getSshAlignedPointer(uintptr_t &ptr, size_t &offset); + static bool doBindingTablePrefetch(); + + static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, size_t bindingTableCount, + const void *srcKernelSsh, size_t srcKernelSshSize, + size_t numberOfBindingTableStates, size_t offsetOfBindingTable); }; template diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 8135cff407..c3aa46ac05 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -10,15 +10,16 @@ #include "shared/source/command_stream/linear_stream.h" #include "shared/source/device/device.h" #include "shared/source/execution_environment/execution_environment.h" +#include "shared/source/gmm_helper/gmm.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/hw_helper.h" +#include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/preamble.h" #include "shared/source/helpers/register_offsets.h" #include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/string.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" - -#include 
"opencl/source/helpers/hardware_commands_helper.h" +#include "shared/source/kernel/kernel_descriptor.h" #include @@ -56,6 +57,12 @@ uint32_t EncodeStates::copySamplerState(IndirectHeap *dsh, return samplerStateOffsetInDsh; } + +template +size_t EncodeStates::getAdjustStateComputeModeSize() { + return 0; +} + template void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress) { int logLws = 0; @@ -208,26 +215,6 @@ void EncodeMath::addition(CommandContainer &container, finalResultRegister); } -template -void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) { - for (int i = 0; i < 3; ++i) { - if (NEO::isUndefinedOffset(offsets[i])) { - continue; - } - EncodeStoreMMIO::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); - } -} - -template -void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) { - for (int i = 0; i < 3; ++i) { - if (NEO::isUndefinedOffset(offsets[i])) { - continue; - } - EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); - } -} - template inline void EncodeSetMMIO::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) { LriHelper::program(container.getCommandStream(), @@ -308,6 +295,75 @@ void EncodeSurfaceState::encodeBuffer(void *dst, uint64_t address, size_ EncodeSurfaceState::encodeExtraBufferParams(surfaceState, allocation, gmmHelper, isReadOnly, numAvailableDevices); } +template +void EncodeSurfaceState::getSshAlignedPointer(uintptr_t &ptr, size_t &offset) { + auto sshAlignmentMask = + getSurfaceBaseAddressAlignmentMask(); + uintptr_t alignedPtr = ptr & sshAlignmentMask; + + offset = 0; + if (ptr 
!= alignedPtr) { + offset = ptrDiff(ptr, alignedPtr); + ptr = alignedPtr; + } +} + +// Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base address) +// as required by the INTERFACE_DESCRIPTOR_DATA. +template +size_t EncodeSurfaceState::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, size_t bindingTableCount, + const void *srcKernelSsh, size_t srcKernelSshSize, + size_t numberOfBindingTableStates, size_t offsetOfBindingTable) { + using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE; + using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA; + using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE; + + if (bindingTableCount == 0) { + // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch + return 0; + } + size_t sshSize = srcKernelSshSize; + DEBUG_BREAK_IF(srcKernelSsh == nullptr); + + auto srcSurfaceState = srcKernelSsh; + // Allocate space for new ssh data + auto dstSurfaceState = dstHeap.getSpace(sshSize); + + // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH. 
+ // We may need to patch these pointers so that they are relative to surface state base address + if (dstSurfaceState == dstHeap.getCpuBase()) { + // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address) + // we need to simply copy the ssh (including BTIs from compiler) + memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize); + return offsetOfBindingTable; + } + + // We can copy-over the surface states, but BTIs will need to be patched + memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable); + + uint32_t surfaceStatesOffset = static_cast(ptrDiff(dstSurfaceState, dstHeap.getCpuBase())); + + // march over BTIs and offset the pointers based on surface state base address + auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, offsetOfBindingTable)); + DEBUG_BREAK_IF(reinterpret_cast(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0); + auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, offsetOfBindingTable)); + BINDING_TABLE_STATE bti = Family::cmdInitBindingTableState; + for (uint32_t i = 0, e = static_cast(numberOfBindingTableStates); i != e; ++i) { + uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer(); + uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset; + bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits + dstBtiTableBase[i] = bti; + DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0); + } + + return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase()); +} + +template +bool EncodeSurfaceState::doBindingTablePrefetch() { + return true; +} + template void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset) { @@ -372,8 +428,23 @@ bool EncodeDispatchKernel::inlineDataProgrammingRequired(const KernelDes } template -size_t 
EncodeStates::getAdjustStateComputeModeSize() { - return 0; +void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) { + for (int i = 0; i < 3; ++i) { + if (NEO::isUndefinedOffset(offsets[i])) { + continue; + } + EncodeStoreMMIO::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); + } +} + +template +void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) { + for (int i = 0; i < 3; ++i) { + if (NEO::isUndefinedOffset(offsets[i])) { + continue; + } + EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); + } } template @@ -440,15 +511,15 @@ void EncodeAtomic::programMiAtomic(MI_ATOMIC *atomic, *atomic = cmd; } -template -void EncodeAtomic::programMiAtomic(LinearStream &commandStream, - uint64_t writeAddress, - ATOMIC_OPCODES opcode, - DATA_SIZE dataSize, - uint32_t returnDataControl, - uint32_t csStall) { +template +void EncodeAtomic::programMiAtomic(LinearStream &commandStream, + uint64_t writeAddress, + ATOMIC_OPCODES opcode, + DATA_SIZE dataSize, + uint32_t returnDataControl, + uint32_t csStall) { auto miAtomic = commandStream.getSpaceForCmd(); - EncodeAtomic::programMiAtomic(miAtomic, writeAddress, opcode, dataSize, returnDataControl, csStall); + EncodeAtomic::programMiAtomic(miAtomic, writeAddress, opcode, dataSize, returnDataControl, csStall); } template @@ -472,19 +543,6 @@ void EncodeBatchBufferStartOrEnd::programBatchBufferEnd(CommandContainer *buffer = cmd; } -template -void EncodeSurfaceState::getSshAlignedPointer(uintptr_t &ptr, size_t &offset) { - auto sshAlignmentMask = - getSurfaceBaseAddressAlignmentMask(); - uintptr_t alignedPtr = ptr & sshAlignmentMask; - - offset = 0; - if (ptr != 
alignedPtr) { - offset = ptrDiff(ptr, alignedPtr); - ptr = alignedPtr; - } -} - template void EncodeMiFlushDW::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, bool timeStampOperation, bool commandWithPostSync) { programMiFlushDwWA(commandStream); diff --git a/shared/source/command_container/command_encoder_base.inl b/shared/source/command_container/command_encoder_base.inl index 2f4af564c6..c27072372a 100644 --- a/shared/source/command_container/command_encoder_base.inl +++ b/shared/source/command_container/command_encoder_base.inl @@ -16,8 +16,6 @@ #include "shared/source/helpers/state_base_address.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" -#include "opencl/source/helpers/hardware_commands_helper.h" - #include "pipe_control_args.h" #include @@ -86,7 +84,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, if (bindingTableStateCount > 0u) { auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); sshOffset = ssh->getUsed(); - bindingTablePointer = static_cast(HardwareCommandsHelper::pushBindingTableAndSurfaceStates( + bindingTablePointer = static_cast(EncodeSurfaceState::pushBindingTableAndSurfaceStates( *ssh, bindingTableStateCount, dispatchInterface->getSurfaceStateHeapData(), dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, @@ -96,7 +94,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, idd.setBindingTablePointer(bindingTablePointer); uint32_t bindingTableStatePrefetchCount = 0; - if (HardwareCommandsHelper::doBindingTablePrefetch()) { + if (EncodeSurfaceState::doBindingTablePrefetch()) { bindingTableStatePrefetchCount = std::min(31u, bindingTableStateCount); } idd.setBindingTableEntryCount(bindingTableStatePrefetchCount); diff --git a/shared/source/gen11/command_encoder_gen11.cpp 
b/shared/source/gen11/command_encoder_gen11.cpp index 886d77f48e..15da2f596a 100644 --- a/shared/source/gen11/command_encoder_gen11.cpp +++ b/shared/source/gen11/command_encoder_gen11.cpp @@ -16,6 +16,12 @@ using Family = NEO::ICLFamily; #include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" namespace NEO { + +template <> +bool EncodeSurfaceState::doBindingTablePrefetch() { + return false; +} + template struct EncodeDispatchKernel; template struct EncodeStates; template struct EncodeMath; diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index 37ee80d70e..cb2eeb7588 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -15,6 +15,7 @@ using Family = NEO::TGLLPFamily; #include "shared/source/command_container/command_encoder.inl" #include "shared/source/command_container/command_encoder_base.inl" #include "shared/source/command_container/encode_compute_mode_tgllp_plus.inl" +#include "shared/source/command_stream/command_stream_receiver.h" namespace NEO { template <> @@ -65,6 +66,11 @@ void EncodeSurfaceState::encodeExtraBufferParams(R_SURFACE_STATE *surfac } } +template <> +bool EncodeSurfaceState::doBindingTablePrefetch() { + return false; +} + template struct EncodeDispatchKernel; template struct EncodeStates; template struct EncodeMath; diff --git a/shared/test/unit_test/gen11/CMakeLists.txt b/shared/test/unit_test/gen11/CMakeLists.txt index ae0a6b4cef..019684f437 100644 --- a/shared/test/unit_test/gen11/CMakeLists.txt +++ b/shared/test/unit_test/gen11/CMakeLists.txt @@ -17,6 +17,7 @@ if(TESTS_GEN11) target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_tests_gen11.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image_surface_state_tests_gen11.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_preemption_gen11.cpp ${COMPUTE_RUNTIME_ULT_GEN11} diff --git 
a/shared/test/unit_test/gen11/command_encoder_tests_gen11.cpp b/shared/test/unit_test/gen11/command_encoder_tests_gen11.cpp new file mode 100644 index 0000000000..e61f04b875 --- /dev/null +++ b/shared/test/unit_test/gen11/command_encoder_tests_gen11.cpp @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/command_encoder.h" + +#include "test.h" + +using namespace NEO; + +using Gen11CommandEncodeTest = testing::Test; +GEN11TEST_F(Gen11CommandEncodeTest, givenGen11PlatformWhenDoBindingTablePrefetchIsCalledThenReturnsFalse) { + EXPECT_FALSE(EncodeSurfaceState::doBindingTablePrefetch()); +} diff --git a/shared/test/unit_test/gen12lp/CMakeLists.txt b/shared/test/unit_test/gen12lp/CMakeLists.txt index cb592dbf97..8867ebdff2 100644 --- a/shared/test/unit_test/gen12lp/CMakeLists.txt +++ b/shared/test/unit_test/gen12lp/CMakeLists.txt @@ -17,6 +17,7 @@ if(TESTS_GEN12LP) target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_tests_gen12lp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image_surface_state_tests_gen12lp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_preemption_gen12lp.cpp ${COMPUTE_RUNTIME_ULT_GEN12LP} diff --git a/shared/test/unit_test/gen12lp/command_encoder_tests_gen12lp.cpp b/shared/test/unit_test/gen12lp/command_encoder_tests_gen12lp.cpp new file mode 100644 index 0000000000..dc26e8f988 --- /dev/null +++ b/shared/test/unit_test/gen12lp/command_encoder_tests_gen12lp.cpp @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/command_encoder.h" + +#include "test.h" + +using namespace NEO; + +using Gen12LpCommandEncodeTest = testing::Test; +GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) { + EXPECT_FALSE(EncodeSurfaceState::doBindingTablePrefetch()); +} diff 
--git a/shared/test/unit_test/gen8/CMakeLists.txt b/shared/test/unit_test/gen8/CMakeLists.txt index 35d0d7843b..c9b263486e 100644 --- a/shared/test/unit_test/gen8/CMakeLists.txt +++ b/shared/test/unit_test/gen8/CMakeLists.txt @@ -16,6 +16,7 @@ if(TESTS_GEN8) target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_tests_gen8.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image_surface_state_tests_gen8.cpp ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_tests_gen8.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_preemption_gen8.cpp diff --git a/shared/test/unit_test/gen8/command_encoder_tests_gen8.cpp b/shared/test/unit_test/gen8/command_encoder_tests_gen8.cpp new file mode 100644 index 0000000000..658796537e --- /dev/null +++ b/shared/test/unit_test/gen8/command_encoder_tests_gen8.cpp @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/command_encoder.h" + +#include "test.h" + +using namespace NEO; + +using Gen8CommandEncodeTest = testing::Test; +GEN8TEST_F(Gen8CommandEncodeTest, givenGen8PlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) { + EXPECT_TRUE(EncodeSurfaceState::doBindingTablePrefetch()); +} diff --git a/shared/test/unit_test/gen9/CMakeLists.txt b/shared/test/unit_test/gen9/CMakeLists.txt index b2d9bfa475..6041d6acf9 100644 --- a/shared/test/unit_test/gen9/CMakeLists.txt +++ b/shared/test/unit_test/gen9/CMakeLists.txt @@ -17,6 +17,7 @@ if(TESTS_GEN9) target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_tests_gen9.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image_surface_state_tests_gen9.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_preemption_gen9.cpp ${COMPUTE_RUNTIME_ULT_GEN9} diff --git a/shared/test/unit_test/gen9/command_encoder_tests_gen9.cpp b/shared/test/unit_test/gen9/command_encoder_tests_gen9.cpp new file mode 100644 index 
0000000000..07785e895c --- /dev/null +++ b/shared/test/unit_test/gen9/command_encoder_tests_gen9.cpp @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/command_encoder.h" + +#include "test.h" + +using namespace NEO; + +using Gen9CommandEncodeTest = testing::Test; +GEN9TEST_F(Gen9CommandEncodeTest, givenGen9PlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) { + EXPECT_TRUE(EncodeSurfaceState::doBindingTablePrefetch()); +}