diff --git a/runtime/command_stream/CMakeLists.txt b/runtime/command_stream/CMakeLists.txt index eb6640ffe8..ffcd82e1c2 100644 --- a/runtime/command_stream/CMakeLists.txt +++ b/runtime/command_stream/CMakeLists.txt @@ -32,6 +32,10 @@ set(RUNTIME_SRCS_COMMAND_STREAM ${CMAKE_CURRENT_SOURCE_DIR}/experimental_command_buffer.inl ${CMAKE_CURRENT_SOURCE_DIR}/linear_stream.cpp ${CMAKE_CURRENT_SOURCE_DIR}/linear_stream.h + ${CMAKE_CURRENT_SOURCE_DIR}/scratch_space_controller.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/scratch_space_controller.h + ${CMAKE_CURRENT_SOURCE_DIR}/scratch_space_controller_base.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/scratch_space_controller_base.h ${CMAKE_CURRENT_SOURCE_DIR}/submissions_aggregator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/submissions_aggregator.h ${CMAKE_CURRENT_SOURCE_DIR}/tbx_command_stream_receiver.cpp @@ -46,3 +50,4 @@ set(RUNTIME_SRCS_COMMAND_STREAM ) target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_COMMAND_STREAM}) set_property(GLOBAL PROPERTY RUNTIME_SRCS_COMMAND_STREAM ${RUNTIME_SRCS_COMMAND_STREAM}) +add_subdirectories() \ No newline at end of file diff --git a/runtime/command_stream/aub_command_stream_receiver.cpp b/runtime/command_stream/aub_command_stream_receiver.cpp index 37795d021d..ba6a078c19 100644 --- a/runtime/command_stream/aub_command_stream_receiver.cpp +++ b/runtime/command_stream/aub_command_stream_receiver.cpp @@ -24,7 +24,8 @@ CommandStreamReceiver *AUBCommandStreamReceiver::create(const HardwareInfo &hwIn // Generate the full filename const auto >SystemInfo = *hwInfo.pSysInfo; std::stringstream strfilename; - strfilename << hwPrefix << "_" << gtSystemInfo.SliceCount << "x" << gtSystemInfo.SubSliceCount << "x" << gtSystemInfo.MaxEuPerSubSlice << "_" << baseName << ".aub"; + uint32_t subSlicesPerSlice = gtSystemInfo.SubSliceCount / gtSystemInfo.SliceCount; + strfilename << hwPrefix << "_" << gtSystemInfo.SliceCount << "x" << subSlicesPerSlice << "x" << gtSystemInfo.MaxEuPerSubSlice << "_" << baseName << ".aub"; // clean-up any fileName issues because of the file system incompatibilities auto fileName = strfilename.str(); diff --git a/runtime/command_stream/command_stream_receiver.cpp b/runtime/command_stream/command_stream_receiver.cpp index 620471929e..b4acf81af3 100644 --- a/runtime/command_stream/command_stream_receiver.cpp +++ b/runtime/command_stream/command_stream_receiver.cpp @@ -9,6 +9,7 @@ #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/command_stream/experimental_command_buffer.h" #include "runtime/command_stream/preemption.h" +#include "runtime/command_stream/scratch_space_controller.h" #include "runtime/device/device.h" #include "runtime/event/event.h" #include "runtime/gtpin/gtpin_notify.h" @@ -157,11 +158,6 @@ void CommandStreamReceiver::cleanupResources() { waitForTaskCountAndCleanAllocationList(this->latestFlushedTaskCount, TEMPORARY_ALLOCATION); waitForTaskCountAndCleanAllocationList(this->latestFlushedTaskCount, REUSABLE_ALLOCATION); - if (scratchAllocation) { - getMemoryManager()->freeGraphicsMemory(scratchAllocation); - scratchAllocation = nullptr; - } - if (debugSurface) { getMemoryManager()->freeGraphicsMemory(debugSurface); debugSurface = nullptr; @@ -217,6 +213,10 @@ void CommandStreamReceiver::setRequiredScratchSize(uint32_t newRequiredScratchSi } } +GraphicsAllocation *CommandStreamReceiver::getScratchAllocation() { + return scratchSpaceController->getScratchSpaceAllocation(); +} + void CommandStreamReceiver::initProgrammingFlags() { isPreambleSent = false; GSBAFor32BitProgrammed = false; @@ -310,6 +310,7 @@ void CommandStreamReceiver::allocateHeapMemory(IndirectHeap::Type heapType, indirectHeap = new IndirectHeap(heapMemory, requireInternalHeap); indirectHeap->overrideMaxSize(finalHeapSize); } + scratchSpaceController->reserveHeap(heapType, indirectHeap); } void CommandStreamReceiver::releaseIndirectHeap(IndirectHeap::Type heapType) { diff --git a/runtime/command_stream/command_stream_receiver.h b/runtime/command_stream/command_stream_receiver.h index cd4adc1839..631ea27448 100644 --- a/runtime/command_stream/command_stream_receiver.h +++ b/runtime/command_stream/command_stream_receiver.h @@ -35,6 +35,7 @@ class LinearStream; class MemoryManager; class OsContext; class OSInterface; +class ScratchSpaceController; class TimestampPacket; struct HwPerfCounter; struct HwTimeStamps; @@ -116,7 +117,7 @@ class CommandStreamReceiver { virtual void overrideMediaVFEStateDirty(bool dirty) { mediaVfeStateDirty = dirty; } void setRequiredScratchSize(uint32_t newRequiredScratchSize); - GraphicsAllocation *getScratchAllocation() const { return scratchAllocation; } + GraphicsAllocation *getScratchAllocation(); GraphicsAllocation *getDebugSurfaceAllocation() const { return debugSurface; } GraphicsAllocation *allocateDebugSurface(size_t size); @@ -180,6 +181,7 @@ class CommandStreamReceiver { std::unique_ptr experimentalCmdBuffer; std::unique_ptr internalAllocationStorage; std::unique_ptr kmdNotifyHelper; + std::unique_ptr scratchSpaceController; std::unique_ptr> profilingTimeStampAllocator; std::unique_ptr> perfCounterAllocator; std::unique_ptr> timestampPacketAllocator; @@ -194,7 +196,6 @@ class CommandStreamReceiver { volatile uint32_t *tagAddress = nullptr; GraphicsAllocation *tagAllocation = nullptr; - GraphicsAllocation *scratchAllocation = nullptr; GraphicsAllocation *preemptionCsrAllocation = nullptr; GraphicsAllocation *debugSurface = nullptr; OSInterface *osInterface = nullptr; diff --git a/runtime/command_stream/command_stream_receiver_hw.h b/runtime/command_stream/command_stream_receiver_hw.h index 5d49a67481..56bb47c71f 100644 --- a/runtime/command_stream/command_stream_receiver_hw.h +++ b/runtime/command_stream/command_stream_receiver_hw.h @@ -81,7 +81,6 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { void handleEventsTimestampPacketTags(LinearStream &csr, DispatchFlags &dispatchFlags, Device ¤tDevice); virtual void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags); virtual void initPageTableManagerRegisters(LinearStream &csr){}; - void createScratchSpaceAllocation(size_t requiredScratchSizeInBytes); void addPipeControlWA(LinearStream &commandStream, bool flushDC); void addDcFlushToPipeControl(typename GfxFamily::PIPE_CONTROL *pCmd, bool flushDC); @@ -90,6 +89,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getSshHeapSize(); uint64_t getScratchPatchAddress(); + void createScratchSpaceController(const HardwareInfo &hwInfoIn); static void emitNoop(LinearStream &commandStream, size_t bytesToUpdate); diff --git a/runtime/command_stream/command_stream_receiver_hw.inl b/runtime/command_stream/command_stream_receiver_hw.inl index 28e95754f5..a3a9150c0a 100644 --- a/runtime/command_stream/command_stream_receiver_hw.inl +++ b/runtime/command_stream/command_stream_receiver_hw.inl @@ -8,6 +8,7 @@ #include "runtime/command_stream/command_stream_receiver_hw.h" #include "runtime/command_stream/experimental_command_buffer.h" #include "runtime/command_stream/linear_stream.h" +#include "runtime/command_stream/scratch_space_controller_base.h" #include "runtime/device/device.h" #include "runtime/event/event.h" #include "runtime/gtpin/gtpin_notify.h" @@ -51,6 +52,7 @@ CommandStreamReceiverHw::CommandStreamReceiverHw(const HardwareInfo & if (DebugManager.flags.EnableTimestampPacket.get() != -1) { timestampPacketWriteEnabled = !!DebugManager.flags.EnableTimestampPacket.get(); } + createScratchSpaceController(hwInfoIn); } template @@ -237,22 +239,21 @@ CompletionStamp CommandStreamReceiverHw::flushTask( csrSizeRequestFlags.numGrfRequiredChanged = this->lastSentNumGrfRequired != dispatchFlags.numGrfRequired; csrSizeRequestFlags.specialPipelineSelectModeChanged = this->lastSpecialPipelineSelectMode != dispatchFlags.specialPipelineSelectMode; - size_t requiredScratchSizeInBytes = requiredScratchSize * device.getDeviceInfo().computeUnitsUsedForScratch; - auto force32BitAllocations = getMemoryManager()->peekForce32BitAllocations(); - bool stateBaseAddressDirty = false; - if (requiredScratchSize && (!scratchAllocation || scratchAllocation->getUnderlyingBufferSize() < requiredScratchSizeInBytes)) { - if (scratchAllocation) { - scratchAllocation->updateTaskCount(this->taskCount, this->deviceIndex); - internalAllocationStorage->storeAllocation(std::unique_ptr(scratchAllocation), TEMPORARY_ALLOCATION); - } - createScratchSpaceAllocation(requiredScratchSizeInBytes); - overrideMediaVFEStateDirty(true); - if (is64bit && !force32BitAllocations) { - stateBaseAddressDirty = true; + bool checkVfeStateDirty = false; + if (requiredScratchSize) { + scratchSpaceController->setRequiredScratchSpace(ssh.getCpuBase(), + requiredScratchSize, + this->taskCount, + this->deviceIndex, + stateBaseAddressDirty, + checkVfeStateDirty); + if (checkVfeStateDirty) { + overrideMediaVFEStateDirty(true); } + makeResident(*scratchSpaceController->getScratchSpaceAllocation()); } auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device)); @@ -308,8 +309,8 @@ CompletionStamp CommandStreamReceiverHw::flushTask( uint64_t newGSHbase = 0; GSBAFor32BitProgrammed = false; - if (is64bit && scratchAllocation && !force32BitAllocations) { - newGSHbase = (uint64_t)scratchAllocation->getUnderlyingBuffer() - PreambleHelper::getScratchSpaceOffsetFor64bit(); + if (is64bit && scratchSpaceController->getScratchSpaceAllocation() && !force32BitAllocations) { + newGSHbase = scratchSpaceController->calculateNewGSH(); } else if (is64bit && force32BitAllocations && dispatchFlags.GSBA32BitRequired) { newGSHbase = getMemoryManager()->allocator32Bit->getBase(); GSBAFor32BitProgrammed = true; @@ -381,9 +382,6 @@ CompletionStamp CommandStreamReceiverHw::flushTask( this->makeResident(*tagAllocation); - if (requiredScratchSize) - makeResident(*scratchAllocation); - if (preemptionCsrAllocation) makeResident(*preemptionCsrAllocation); @@ -608,22 +606,6 @@ void CommandStreamReceiverHw::addPipeControl(LinearStream &commandStr } } -template -uint64_t CommandStreamReceiverHw::getScratchPatchAddress() { - //for 32 bit scratch space pointer is being programmed in Media VFE State and is relative to 0 as General State Base Address - //for 64 bit, scratch space pointer is being programmed as "General State Base Address - scratchSpaceOffsetFor64bit" - // and "0 + scratchSpaceOffsetFor64bit" is being programmed in Media VFE state - - uint64_t scratchAddress = 0; - if (requiredScratchSize) { - scratchAddress = scratchAllocation->getGpuAddressToPatch(); - if (is64bit && !getMemoryManager()->peekForce32BitAllocations()) { - //this is to avoid scractch allocation offset "0" - scratchAddress = PreambleHelper::getScratchSpaceOffsetFor64bit(); - } - } - return scratchAddress; -} template size_t CommandStreamReceiverHw::getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device) { size_t size = getRequiredCmdStreamSize(dispatchFlags, device); @@ -821,7 +803,12 @@ void CommandStreamReceiverHw::handleEventsTimestampPacketTags(LinearS } template -void CommandStreamReceiverHw::createScratchSpaceAllocation(size_t requiredScratchSizeInBytes) { - scratchAllocation = getMemoryManager()->allocateGraphicsMemoryInPreferredPool(AllocationFlags(true), 0, nullptr, requiredScratchSizeInBytes, GraphicsAllocation::AllocationType::SCRATCH_SURFACE); +void CommandStreamReceiverHw::createScratchSpaceController(const HardwareInfo &hwInfoIn) { + scratchSpaceController = std::make_unique(hwInfoIn, executionEnvironment, *internalAllocationStorage.get()); +} + +template +uint64_t CommandStreamReceiverHw::getScratchPatchAddress() { + return scratchSpaceController->getScratchPatchAddress(); } } // namespace OCLRT diff --git a/runtime/command_stream/scratch_space_controller.cpp b/runtime/command_stream/scratch_space_controller.cpp new file mode 100644 index 0000000000..7c359681e7 --- /dev/null +++ b/runtime/command_stream/scratch_space_controller.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "runtime/command_stream/scratch_space_controller.h" +#include "runtime/execution_environment/execution_environment.h" +#include "runtime/helpers/hw_helper.h" +#include "runtime/memory_manager/graphics_allocation.h" +#include "runtime/memory_manager/internal_allocation_storage.h" +#include "runtime/memory_manager/memory_manager.h" + +namespace OCLRT { +ScratchSpaceController::ScratchSpaceController(const HardwareInfo &info, ExecutionEnvironment &environment, InternalAllocationStorage &allocationStorage) + : hwInfo(info), executionEnvironment(environment), csrAllocationStorage(allocationStorage) { + auto &hwHelper = HwHelper::get(info.pPlatform->eRenderCoreFamily); + computeUnitsUsedForScratch = hwHelper.getComputeUnitsUsedForScratch(&hwInfo); +} + +ScratchSpaceController::~ScratchSpaceController() { + if (scratchAllocation) { + getMemoryManager()->freeGraphicsMemory(scratchAllocation); + } +} + +MemoryManager *ScratchSpaceController::getMemoryManager() const { + UNRECOVERABLE_IF(executionEnvironment.memoryManager.get() == nullptr); + return executionEnvironment.memoryManager.get(); +} +} // namespace OCLRT diff --git a/runtime/command_stream/scratch_space_controller.h b/runtime/command_stream/scratch_space_controller.h new file mode 100644 index 0000000000..59d86c8569 --- /dev/null +++ b/runtime/command_stream/scratch_space_controller.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "runtime/indirect_heap/indirect_heap.h" +#include +#include + +namespace OCLRT { + +class Device; +class ExecutionEnvironment; +class GraphicsAllocation; +class InternalAllocationStorage; +class MemoryManager; +struct HardwareInfo; + +class ScratchSpaceController { + public: + ScratchSpaceController(const HardwareInfo &info, ExecutionEnvironment &environment, InternalAllocationStorage &allocationStorage); + virtual ~ScratchSpaceController(); + + GraphicsAllocation *getScratchSpaceAllocation() { + return scratchAllocation; + } + virtual void setRequiredScratchSpace(void *sshBaseAddress, + uint32_t requiredPerThreadScratchSize, + uint32_t currentTaskCount, + uint32_t deviceIdx, + bool &stateBaseAddressDirty, + bool &vfeStateDirty) = 0; + virtual uint64_t calculateNewGSH() = 0; + virtual uint64_t getScratchPatchAddress() = 0; + + virtual void reserveHeap(IndirectHeap::Type heapType, IndirectHeap *&indirectHeap) = 0; + + protected: + MemoryManager *getMemoryManager() const; + + const HardwareInfo &hwInfo; + ExecutionEnvironment &executionEnvironment; + GraphicsAllocation *scratchAllocation = nullptr; + InternalAllocationStorage &csrAllocationStorage; + size_t scratchSizeBytes = 0; + bool force32BitAllocation = false; + uint32_t computeUnitsUsedForScratch = 0; +}; +} // namespace OCLRT diff --git a/runtime/command_stream/scratch_space_controller_base.cpp b/runtime/command_stream/scratch_space_controller_base.cpp new file mode 100644 index 0000000000..a6108c3c0d --- /dev/null +++ b/runtime/command_stream/scratch_space_controller_base.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "runtime/command_stream/scratch_space_controller_base.h" +#include "runtime/helpers/aligned_memory.h" +#include "runtime/helpers/hw_helper.h" +#include "runtime/helpers/preamble.h" +#include "runtime/memory_manager/memory_constants.h" +#include "runtime/memory_manager/graphics_allocation.h" +#include "runtime/memory_manager/internal_allocation_storage.h" +#include "runtime/memory_manager/memory_manager.h" + +namespace OCLRT { +ScratchSpaceControllerBase::ScratchSpaceControllerBase(const HardwareInfo &info, ExecutionEnvironment &environment, InternalAllocationStorage &allocationStorage) + : ScratchSpaceController(info, environment, allocationStorage) { +} + +void ScratchSpaceControllerBase::setRequiredScratchSpace(void *sshBaseAddress, + uint32_t requiredPerThreadScratchSize, + uint32_t currentTaskCount, + uint32_t deviceIdx, + bool &stateBaseAddressDirty, + bool &vfeStateDirty) { + size_t requiredScratchSizeInBytes = requiredPerThreadScratchSize * computeUnitsUsedForScratch; + if (requiredScratchSizeInBytes && (!scratchAllocation || scratchSizeBytes < requiredScratchSizeInBytes)) { + if (scratchAllocation) { + scratchAllocation->updateTaskCount(currentTaskCount, deviceIdx); + csrAllocationStorage.storeAllocation(std::unique_ptr(scratchAllocation), TEMPORARY_ALLOCATION); + } + scratchSizeBytes = requiredScratchSizeInBytes; + createScratchSpaceAllocation(); + vfeStateDirty = true; + force32BitAllocation = getMemoryManager()->peekForce32BitAllocations(); + if (is64bit && !force32BitAllocation) { + stateBaseAddressDirty = true; + } + } +} + +void ScratchSpaceControllerBase::createScratchSpaceAllocation() { + scratchAllocation = getMemoryManager()->allocateGraphicsMemoryInPreferredPool(AllocationFlags(true), 0, nullptr, scratchSizeBytes, GraphicsAllocation::AllocationType::SCRATCH_SURFACE); + UNRECOVERABLE_IF(scratchAllocation == nullptr); +} + +uint64_t ScratchSpaceControllerBase::calculateNewGSH() { + auto &hwHelper = HwHelper::get(hwInfo.pPlatform->eRenderCoreFamily); + auto scratchSpaceOffsetFor64bit = hwHelper.getScratchSpaceOffsetFor64bit(); + return reinterpret_cast(scratchAllocation->getUnderlyingBuffer()) - scratchSpaceOffsetFor64bit; +} +uint64_t ScratchSpaceControllerBase::getScratchPatchAddress() { + //for 32 bit scratch space pointer is being programmed in Media VFE State and is relative to 0 as General State Base Address + //for 64 bit, scratch space pointer is being programmed as "General State Base Address - scratchSpaceOffsetFor64bit" + // and "0 + scratchSpaceOffsetFor64bit" is being programmed in Media VFE state + uint64_t scratchAddress = 0; + if (scratchAllocation) { + scratchAddress = scratchAllocation->getGpuAddressToPatch(); + if (is64bit && !getMemoryManager()->peekForce32BitAllocations()) { + auto &hwHelper = HwHelper::get(hwInfo.pPlatform->eRenderCoreFamily); + auto scratchSpaceOffsetFor64bit = hwHelper.getScratchSpaceOffsetFor64bit(); + //this is to avoid scractch allocation offset "0" + scratchAddress = scratchSpaceOffsetFor64bit; + } + } + return scratchAddress; +} + +void ScratchSpaceControllerBase::reserveHeap(IndirectHeap::Type heapType, IndirectHeap *&indirectHeap) { +} + +} // namespace OCLRT diff --git a/runtime/command_stream/scratch_space_controller_base.h b/runtime/command_stream/scratch_space_controller_base.h new file mode 100644 index 0000000000..e642092ee5 --- /dev/null +++ b/runtime/command_stream/scratch_space_controller_base.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "runtime/command_stream/scratch_space_controller.h" + +namespace OCLRT { + +class ScratchSpaceControllerBase : public ScratchSpaceController { + public: + ScratchSpaceControllerBase(const HardwareInfo &info, ExecutionEnvironment &environment, InternalAllocationStorage &allocationStorage); + + void setRequiredScratchSpace(void *sshBaseAddress, + uint32_t requiredPerThreadScratchSize, + uint32_t currentTaskCount, + uint32_t deviceIdx, + bool &stateBaseAddressDirty, + bool &vfeStateDirty) override; + uint64_t calculateNewGSH() override; + uint64_t getScratchPatchAddress() override; + + void reserveHeap(IndirectHeap::Type heapType, IndirectHeap *&indirectHeap) override; + + protected: + void createScratchSpaceAllocation(); +}; +} // namespace OCLRT diff --git a/runtime/gen10/command_stream_receiver_hw_gen10.cpp b/runtime/gen10/command_stream_receiver_hw_gen10.cpp index 5d28fa1d4f..2584dbb3c1 100644 --- a/runtime/gen10/command_stream_receiver_hw_gen10.cpp +++ b/runtime/gen10/command_stream_receiver_hw_gen10.cpp @@ -66,4 +66,5 @@ const Family::MI_BATCH_BUFFER_START Family::cmdInitBatchBufferStart = Family::MI const Family::MI_BATCH_BUFFER_END Family::cmdInitBatchBufferEnd = Family::MI_BATCH_BUFFER_END::sInit(); const Family::PIPE_CONTROL Family::cmdInitPipeControl = Family::PIPE_CONTROL::sInit(); const Family::MI_SEMAPHORE_WAIT Family::cmdInitMiSemaphoreWait = Family::MI_SEMAPHORE_WAIT::sInit(); +const Family::RENDER_SURFACE_STATE Family::cmdRenderSurfaceState = Family::RENDER_SURFACE_STATE::sInit(); } // namespace OCLRT diff --git a/runtime/gen10/hw_cmds.h b/runtime/gen10/hw_cmds.h index 713ca9edf3..cadee83dff 100644 --- a/runtime/gen10/hw_cmds.h +++ b/runtime/gen10/hw_cmds.h @@ -34,6 +34,7 @@ struct CNLFamily : public GEN10 { static const MI_BATCH_BUFFER_START cmdInitBatchBufferStart; static const PIPE_CONTROL cmdInitPipeControl; static const MI_SEMAPHORE_WAIT cmdInitMiSemaphoreWait; + static const RENDER_SURFACE_STATE cmdRenderSurfaceState; static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) { return cmdSetBaseFamily == IGFX_GEN8_CORE; diff --git a/runtime/gen10/hw_info_cnl.inl b/runtime/gen10/hw_info_cnl.inl index 34e6da940f..7e36f9666d 100644 --- a/runtime/gen10/hw_info_cnl.inl +++ b/runtime/gen10/hw_info_cnl.inl @@ -77,7 +77,7 @@ void CNL_2x5x8::setupHardwareInfo(GT_SYSTEM_INFO *gtSysInfo, FeatureTable *featu gtSysInfo->EUCount = 39; gtSysInfo->ThreadCount = 39 * CNL::threadsPerEu; gtSysInfo->SliceCount = 2; - gtSysInfo->SubSliceCount = 5; + gtSysInfo->SubSliceCount = 10; gtSysInfo->L3CacheSizeInKb = 1536; gtSysInfo->L3BankCount = 6; gtSysInfo->MaxFillRate = 16; @@ -105,7 +105,7 @@ void CNL_2x4x8::setupHardwareInfo(GT_SYSTEM_INFO *gtSysInfo, FeatureTable *featu gtSysInfo->EUCount = 31; gtSysInfo->ThreadCount = 31 * CNL::threadsPerEu; gtSysInfo->SliceCount = 2; - gtSysInfo->SubSliceCount = 4; + gtSysInfo->SubSliceCount = 8; gtSysInfo->L3CacheSizeInKb = 1536; gtSysInfo->L3BankCount = 6; gtSysInfo->MaxFillRate = 16; @@ -189,7 +189,7 @@ void CNL_4x9x8::setupHardwareInfo(GT_SYSTEM_INFO *gtSysInfo, FeatureTable *featu gtSysInfo->EUCount = 71; gtSysInfo->ThreadCount = 71 * CNL::threadsPerEu; gtSysInfo->SliceCount = 4; - gtSysInfo->SubSliceCount = 9; + gtSysInfo->SubSliceCount = 36; gtSysInfo->L3CacheSizeInKb = 1536; gtSysInfo->L3BankCount = 6; gtSysInfo->MaxFillRate = 16; diff --git a/runtime/gen8/command_stream_receiver_hw_gen8.cpp b/runtime/gen8/command_stream_receiver_hw_gen8.cpp index 9ec8ac80fd..327fdf2c48 100644 --- a/runtime/gen8/command_stream_receiver_hw_gen8.cpp +++ b/runtime/gen8/command_stream_receiver_hw_gen8.cpp @@ -59,4 +59,5 @@ const Family::MI_BATCH_BUFFER_START Family::cmdInitBatchBufferStart = Family::MI const Family::MI_BATCH_BUFFER_END Family::cmdInitBatchBufferEnd = Family::MI_BATCH_BUFFER_END::sInit(); const Family::PIPE_CONTROL Family::cmdInitPipeControl = Family::PIPE_CONTROL::sInit(); const Family::MI_SEMAPHORE_WAIT Family::cmdInitMiSemaphoreWait = Family::MI_SEMAPHORE_WAIT::sInit(); +const Family::RENDER_SURFACE_STATE Family::cmdRenderSurfaceState = Family::RENDER_SURFACE_STATE::sInit(); } // namespace OCLRT diff --git a/runtime/gen8/hw_cmds_base.h b/runtime/gen8/hw_cmds_base.h index 1f97dcb394..0657bc3be2 100644 --- a/runtime/gen8/hw_cmds_base.h +++ b/runtime/gen8/hw_cmds_base.h @@ -33,6 +33,7 @@ struct BDWFamily : public GEN8 { static const MI_BATCH_BUFFER_START cmdInitBatchBufferStart; static const PIPE_CONTROL cmdInitPipeControl; static const MI_SEMAPHORE_WAIT cmdInitMiSemaphoreWait; + static const RENDER_SURFACE_STATE cmdRenderSurfaceState; static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) { return cmdSetBaseFamily == IGFX_GEN8_CORE; diff --git a/runtime/gen9/command_stream_receiver_hw_gen9.cpp b/runtime/gen9/command_stream_receiver_hw_gen9.cpp index 42fe9a6f34..14ab80cfdc 100644 --- a/runtime/gen9/command_stream_receiver_hw_gen9.cpp +++ b/runtime/gen9/command_stream_receiver_hw_gen9.cpp @@ -58,4 +58,5 @@ const Family::MI_BATCH_BUFFER_START Family::cmdInitBatchBufferStart = Family::MI const Family::MI_BATCH_BUFFER_END Family::cmdInitBatchBufferEnd = Family::MI_BATCH_BUFFER_END::sInit(); const Family::PIPE_CONTROL Family::cmdInitPipeControl = Family::PIPE_CONTROL::sInit(); const Family::MI_SEMAPHORE_WAIT Family::cmdInitMiSemaphoreWait = Family::MI_SEMAPHORE_WAIT::sInit(); +const Family::RENDER_SURFACE_STATE Family::cmdRenderSurfaceState = Family::RENDER_SURFACE_STATE::sInit(); } // namespace OCLRT diff --git a/runtime/gen9/hw_cmds_base.h b/runtime/gen9/hw_cmds_base.h index d3ab4d3c4f..07f5f305c4 100644 --- a/runtime/gen9/hw_cmds_base.h +++ b/runtime/gen9/hw_cmds_base.h @@ -34,6 +34,7 @@ struct SKLFamily : public GEN9 { static const MI_BATCH_BUFFER_START cmdInitBatchBufferStart; static const PIPE_CONTROL cmdInitPipeControl; static const MI_SEMAPHORE_WAIT cmdInitMiSemaphoreWait; + static const RENDER_SURFACE_STATE cmdRenderSurfaceState; static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) { return cmdSetBaseFamily == IGFX_GEN8_CORE; diff --git a/runtime/helpers/hw_helper.h b/runtime/helpers/hw_helper.h index 05500973ac..25fdaf8cbe 100644 --- a/runtime/helpers/hw_helper.h +++ b/runtime/helpers/hw_helper.h @@ -6,6 +6,7 @@ */ #pragma once +#include "runtime/api/cl_types.h" #include "runtime/gen_common/aub_mapper.h" #include "runtime/gen_common/hw_cmds.h" #include "runtime/command_stream/linear_stream.h" @@ -15,6 +16,8 @@ #include namespace OCLRT { +class ExecutionEnvironment; +class GraphicsAllocation; struct HardwareCapabilities; class HwHelper { @@ -37,6 +40,18 @@ class HwHelper { virtual const AubMemDump::LrcaHelper &getCsTraits(EngineInstanceT engineInstance) const = 0; virtual bool supportsYTiling() const = 0; virtual bool timestampPacketWriteSupported() const = 0; + virtual size_t getRenderSurfaceStateSize() const = 0; + virtual void setRenderSurfaceStateForBuffer(ExecutionEnvironment &executionEnvironment, + void *surfaceStateBuffer, + size_t bufferSize, + uint64_t gpuVa, + size_t offset, + uint32_t pitch, + GraphicsAllocation *gfxAlloc, + cl_mem_flags flags, + uint32_t surfaceType, + bool forceNonAuxMode) = 0; + virtual size_t getScratchSpaceOffsetFor64bit() = 0; protected: HwHelper() = default; @@ -72,6 +87,11 @@ class HwHelperHw : public HwHelper { return sizeof(INTERFACE_DESCRIPTOR_DATA); } + size_t getRenderSurfaceStateSize() const override { + using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; + return sizeof(RENDER_SURFACE_STATE); + } + const AubMemDump::LrcaHelper &getCsTraits(EngineInstanceT engineInstance) const override; size_t getMaxBarrierRegisterPerSlice() const override; @@ -98,6 +118,19 @@ class HwHelperHw : public HwHelper { bool isPageTableManagerSupported(const HardwareInfo &hwInfo) const override; + void setRenderSurfaceStateForBuffer(ExecutionEnvironment &executionEnvironment, + void *surfaceStateBuffer, + size_t bufferSize, + uint64_t gpuVa, + size_t offset, + uint32_t pitch, + GraphicsAllocation *gfxAlloc, + cl_mem_flags flags, + uint32_t surfaceType, + bool forceNonAuxMode) override; + + size_t getScratchSpaceOffsetFor64bit() override; + protected: HwHelperHw() = default; }; @@ -149,4 +182,13 @@ struct PipeControlHelper { } }; +union SURFACE_STATE_BUFFER_LENGTH { + uint32_t Length; + struct SurfaceState { + uint32_t Width : BITFIELD_RANGE(0, 6); + uint32_t Height : BITFIELD_RANGE(7, 20); + uint32_t Depth : BITFIELD_RANGE(21, 31); + } SurfaceState; +}; + } // namespace OCLRT diff --git a/runtime/helpers/hw_helper_common.inl b/runtime/helpers/hw_helper_common.inl index 106eb93f99..d650690894 100644 --- a/runtime/helpers/hw_helper_common.inl +++ b/runtime/helpers/hw_helper_common.inl @@ -6,11 +6,17 @@ */ #include "runtime/aub_mem_dump/aub_mem_dump.h" +#include "runtime/execution_environment/execution_environment.h" +#include "runtime/gmm_helper/gmm.h" +#include "runtime/gmm_helper/gmm_helper.h" +#include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/hw_helper.h" #include "runtime/helpers/hw_info.h" +#include "runtime/memory_manager/graphics_allocation.h" #include "runtime/memory_manager/memory_constants.h" namespace OCLRT { + template void HwHelperHw::setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) { coherencyFlag = true; @@ -84,4 +90,76 @@ template bool HwHelperHw::timestampPacketWriteSupported() const { return false; } + +template +void HwHelperHw::setRenderSurfaceStateForBuffer(ExecutionEnvironment &executionEnvironment, + void *surfaceStateBuffer, + size_t bufferSize, + uint64_t gpuVa, + size_t offset, + uint32_t pitch, + GraphicsAllocation *gfxAlloc, + cl_mem_flags flags, + uint32_t surfaceType, + bool forceNonAuxMode) { + using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE; + using SURFACE_FORMAT = typename RENDER_SURFACE_STATE::SURFACE_FORMAT; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + + auto gmmHelper = executionEnvironment.getGmmHelper(); + auto surfaceState = reinterpret_cast(surfaceStateBuffer); + *surfaceState = Family::cmdRenderSurfaceState; + auto surfaceSize = alignUp(bufferSize, 4); + + SURFACE_STATE_BUFFER_LENGTH Length = {0}; + Length.Length = static_cast(surfaceSize - 1); + + surfaceState->setWidth(Length.SurfaceState.Width + 1); + surfaceState->setHeight(Length.SurfaceState.Height + 1); + surfaceState->setDepth(Length.SurfaceState.Depth + 1); + if (pitch) { + surfaceState->setSurfacePitch(pitch); + } + + // The graphics allocation for Host Ptr surface will be created in makeResident call and GPU address is expected to be the same as CPU address + auto bufferStateAddress = (gfxAlloc != nullptr) ? gfxAlloc->getGpuAddress() : gpuVa; + bufferStateAddress += offset; + + auto bufferStateSize = (gfxAlloc != nullptr) ? gfxAlloc->getUnderlyingBufferSize() : bufferSize; + + surfaceState->setSurfaceType(static_cast(surfaceType)); + + surfaceState->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW); + surfaceState->setSurfaceVerticalAlignment(RENDER_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4); + surfaceState->setSurfaceHorizontalAlignment(RENDER_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4); + + surfaceState->setTileMode(RENDER_SURFACE_STATE::TILE_MODE_LINEAR); + surfaceState->setVerticalLineStride(0); + surfaceState->setVerticalLineStrideOffset(0); + if ((isAligned(bufferStateAddress) && isAligned(bufferStateSize)) || + ((flags & CL_MEM_READ_ONLY)) != 0) { + surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER)); + } else { + surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)); + } + + surfaceState->setSurfaceBaseAddress(bufferStateAddress); + + Gmm *gmm = gfxAlloc ? gfxAlloc->gmm : nullptr; + if (gmm && gmm->isRenderCompressed && !forceNonAuxMode && + GraphicsAllocation::AllocationType::BUFFER_COMPRESSED == gfxAlloc->getAllocationType()) { + // Its expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios + surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); + surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); + } else { + surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT); + surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE); + } +} + +template +size_t HwHelperHw::getScratchSpaceOffsetFor64bit() { + return 4096; +} + } // namespace OCLRT diff --git a/runtime/helpers/preamble.h b/runtime/helpers/preamble.h index d2c5358efe..ffeeaecc2d 100644 --- a/runtime/helpers/preamble.h +++ b/runtime/helpers/preamble.h @@ -24,8 +24,6 @@ struct PreambleHelper { using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - static constexpr size_t getScratchSpaceOffsetFor64bit() { return 4096; } - static void programL3(LinearStream *pCommandStream, uint32_t l3Config); static void programPipelineSelect(LinearStream *pCommandStream, const DispatchFlags &dispatchFlags); static uint32_t getDefaultThreadArbitrationPolicy(); diff --git a/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp b/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp index d73051bd2b..f9db70c47f 100644 --- a/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp +++ b/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp @@ -380,12 +380,7 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixturesetArgSvm(1, sizeUserMemory, destMemory); - outBuffer = csr->getMemoryManager()->allocateGraphicsMemory(sizeUserMemory, destMemory); - csr->makeResidentHostPtrAllocation(outBuffer); - csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr(outBuffer), TEMPORARY_ALLOCATION); - ASSERT_NE(nullptr, outBuffer); - outBuffer->setAllocationType(GraphicsAllocation::AllocationType::BUFFER); - outBuffer->setMemObjectsAllocationWithWritableFlags(true); + outBuffer = createHostPtrAllocationFromSvmPtr(destMemory, sizeUserMemory); } void initializeExpectedMemory(size_t globalX, size_t globalY, size_t globalZ) { diff --git a/unit_tests/aub_tests/command_stream/aub_mem_dump_tests.cpp b/unit_tests/aub_tests/command_stream/aub_mem_dump_tests.cpp index d990b7f919..6b49c5d516 100644 --- a/unit_tests/aub_tests/command_stream/aub_mem_dump_tests.cpp +++ b/unit_tests/aub_tests/command_stream/aub_mem_dump_tests.cpp @@ -22,7 +22,8 @@ using OCLRT::folderAUB; std::string getAubFileName(const OCLRT::Device *pDevice, const std::string baseName) { const auto pGtSystemInfo = pDevice->getHardwareInfo().pSysInfo; std::stringstream strfilename; - strfilename << pDevice->getProductAbbrev() << "_" << pGtSystemInfo->SliceCount << "x" << pGtSystemInfo->SubSliceCount << "x" << pGtSystemInfo->MaxEuPerSubSlice << "_" << baseName; + uint32_t subSlicesPerSlice = pGtSystemInfo->SubSliceCount / pGtSystemInfo->SliceCount; + strfilename << pDevice->getProductAbbrev() << "_" << pGtSystemInfo->SliceCount << "x" << subSlicesPerSlice << "x" << pGtSystemInfo->MaxEuPerSubSlice << "_" << baseName; return strfilename.str(); } diff --git a/unit_tests/aub_tests/fixtures/CMakeLists.txt b/unit_tests/aub_tests/fixtures/CMakeLists.txt index 38130377c0..529b40e47c 100644 --- a/unit_tests/aub_tests/fixtures/CMakeLists.txt +++ b/unit_tests/aub_tests/fixtures/CMakeLists.txt @@ -6,6 +6,7 @@ target_sources(igdrcl_aub_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/aub_fixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/aub_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/aub_parent_kernel_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/fixture_tests.cpp diff --git a/unit_tests/aub_tests/fixtures/aub_fixture.cpp b/unit_tests/aub_tests/fixtures/aub_fixture.cpp new file mode 100644 index 0000000000..4a788f639f --- /dev/null +++ b/unit_tests/aub_tests/fixtures/aub_fixture.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "runtime/memory_manager/graphics_allocation.h" +#include "runtime/memory_manager/internal_allocation_storage.h" +#include "runtime/memory_manager/memory_manager.h" +#include "unit_tests/aub_tests/fixtures/aub_fixture.h" + +namespace OCLRT { + +GraphicsAllocation *AUBFixture::createHostPtrAllocationFromSvmPtr(void *svmPtr, size_t size) { + GraphicsAllocation *allocation = csr->getMemoryManager()->allocateGraphicsMemory(size, svmPtr); + csr->makeResidentHostPtrAllocation(allocation); + csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr(allocation), TEMPORARY_ALLOCATION); + allocation->setAllocationType(GraphicsAllocation::AllocationType::BUFFER); + allocation->setMemObjectsAllocationWithWritableFlags(true); + return allocation; +} + +} // namespace OCLRT diff --git a/unit_tests/aub_tests/fixtures/aub_fixture.h b/unit_tests/aub_tests/fixtures/aub_fixture.h index fa16069b29..38837872d4 100644 --- a/unit_tests/aub_tests/fixtures/aub_fixture.h +++ b/unit_tests/aub_tests/fixtures/aub_fixture.h @@ -53,6 +53,8 @@ class AUBFixture : public CommandQueueHwFixture { CommandQueueHwFixture::TearDown(); } + GraphicsAllocation *createHostPtrAllocationFromSvmPtr(void *svmPtr, size_t size); + template AUBCommandStreamReceiverHw *getAubCsr() { AUBCommandStreamReceiverHw *aubCsr = nullptr; diff --git a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp index bfbacb3790..848e0518b5 100644 --- a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp @@ -6,6 +6,7 @@ */ #include "reg_configs_common.h" +#include "runtime/helpers/hw_helper.h" #include "runtime/memory_manager/allocations_list.h" #include "unit_tests/command_queue/enqueue_fixture.h" #include "unit_tests/fixtures/hello_world_fixture.h" @@ -325,8 +326,8 @@ HWCMDTEST_P(IGFX_GEN8_CORE, EnqueueScratchSpaceTests, GivenKernelRequiringScratc EXPECT_NE(0u, cmd->getScratchSpaceBasePointer()); EXPECT_EQ(0u, GSHaddress); } else { - EXPECT_EQ(PreambleHelper::getScratchSpaceOffsetFor64bit(), cmd->getScratchSpaceBasePointer()); - EXPECT_EQ(GSHaddress + PreambleHelper::getScratchSpaceOffsetFor64bit(), (uintptr_t)graphicsAllocation->getUnderlyingBuffer()); + EXPECT_EQ(HwHelperHw::get().getScratchSpaceOffsetFor64bit(), cmd->getScratchSpaceBasePointer()); + EXPECT_EQ(GSHaddress + HwHelperHw::get().getScratchSpaceOffsetFor64bit(), (uintptr_t)graphicsAllocation->getUnderlyingBuffer()); } auto allocationSize = scratchSize * pDevice->getDeviceInfo().computeUnitsUsedForScratch; @@ -378,7 +379,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, EnqueueScratchSpaceTests, GivenKernelRequiringScratc auto *sba2 = (STATE_BASE_ADDRESS *)*itorCmdForStateBase; auto GSHaddress2 = sba2->getGeneralStateBaseAddress(); EXPECT_NE(0u, GSHaddress2); - EXPECT_EQ(PreambleHelper::getScratchSpaceOffsetFor64bit(), cmd2->getScratchSpaceBasePointer()); + EXPECT_EQ(HwHelperHw::get().getScratchSpaceOffsetFor64bit(), cmd2->getScratchSpaceBasePointer()); EXPECT_NE(GSHaddress2, GSHaddress); } EXPECT_EQ(graphicsAllocation->getUnderlyingBufferSize(), allocationSize); @@ -399,7 +400,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, EnqueueScratchSpaceTests, GivenKernelRequiringScratc if (is32bit) { EXPECT_EQ(0u, GSBaddress); } else if (is64bit) { - EXPECT_EQ((uintptr_t)graphicsAllocation2->getUnderlyingBuffer(), GSBaddress + PreambleHelper::getScratchSpaceOffsetFor64bit()); + EXPECT_EQ((uintptr_t)graphicsAllocation2->getUnderlyingBuffer(), GSBaddress + HwHelperHw::get().getScratchSpaceOffsetFor64bit()); } EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty()); diff --git a/unit_tests/command_stream/command_stream_receiver_flush_task_2_tests.cpp b/unit_tests/command_stream/command_stream_receiver_flush_task_2_tests.cpp index 44df001161..25e8f82585 100644 --- a/unit_tests/command_stream/command_stream_receiver_flush_task_2_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_flush_task_2_tests.cpp @@ -6,6 +6,7 @@ */ #include "reg_configs_common.h" +#include "runtime/helpers/hw_helper.h" #include "runtime/memory_manager/internal_allocation_storage.h" #include "runtime/os_interface/os_context.h" #include "test.h" @@ -498,7 +499,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, givenTwoConsecu uint64_t scratchBaseHighPart = (uint64_t)mediaVfeState->getScratchSpaceBasePointerHigh(); if (is64bit && !pDevice->getDeviceInfo().force32BitAddressess) { - uint64_t expectedAddress = PreambleHelper::getScratchSpaceOffsetFor64bit(); + uint64_t expectedAddress = HwHelperHw::get().getScratchSpaceOffsetFor64bit(); EXPECT_EQ(expectedAddress, scratchBaseLowPart); EXPECT_EQ(0u, scratchBaseHighPart); } else { @@ -510,7 +511,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, givenTwoConsecu EXPECT_EQ(pDevice->getMemoryManager()->allocator32Bit->getBase(), GSHaddress); } else { if (is64bit) { - EXPECT_EQ(graphicsAddress - PreambleHelper::getScratchSpaceOffsetFor64bit(), GSHaddress); + EXPECT_EQ(graphicsAddress - HwHelperHw::get().getScratchSpaceOffsetFor64bit(), GSHaddress); } else { EXPECT_EQ(0u, GSHaddress); } @@ -608,7 +609,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, givenNDRangeKer uint64_t scratchBaseHighPart = (uint64_t)mediaVfeState->getScratchSpaceBasePointerHigh(); if (is64bit && !pDevice->getDeviceInfo().force32BitAddressess) { - lowPartGraphicsAddress = PreambleHelper::getScratchSpaceOffsetFor64bit(); + lowPartGraphicsAddress = HwHelperHw::get().getScratchSpaceOffsetFor64bit(); highPartGraphicsAddress = 0u; } @@ -619,7 +620,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, givenNDRangeKer EXPECT_EQ(pDevice->getMemoryManager()->allocator32Bit->getBase(), GSHaddress); } else { if (is64bit) { - EXPECT_EQ(graphicsAddress - PreambleHelper::getScratchSpaceOffsetFor64bit(), GSHaddress); + EXPECT_EQ(graphicsAddress - HwHelperHw::get().getScratchSpaceOffsetFor64bit(), GSHaddress); } else { EXPECT_EQ(0u, GSHaddress); } diff --git a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp index 329bc51dd5..430b9727bf 100644 --- a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp @@ -10,6 +10,7 @@ #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/command_stream/linear_stream.h" +#include "runtime/command_stream/scratch_space_controller.h" #include "runtime/os_interface/debug_settings_manager.h" #include "runtime/event/user_event.h" #include "runtime/helpers/aligned_memory.h" @@ -210,3 +211,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverHwTest, WhenCommandStreamReceiv auto &commandStreamReceiver = pDevice->getCommandStreamReceiver(); EXPECT_EQ(64 * KB, commandStreamReceiver.defaultSshSize); } + +HWTEST_F(CommandStreamReceiverHwTest, WhenScratchSpaceIsNotRequiredThenScratchAllocationIsNotCreated) { + auto commandStreamReceiver = std::make_unique>(*platformDevices[0], *pDevice->executionEnvironment); + auto scratchController = commandStreamReceiver->scratchSpaceController.get(); + + bool stateBaseAddressDirty = false; + bool cfeStateDirty = false; + scratchController->setRequiredScratchSpace(reinterpret_cast(0x2000), 0u, 0u, 0u, stateBaseAddressDirty, cfeStateDirty); + EXPECT_FALSE(cfeStateDirty); + EXPECT_FALSE(stateBaseAddressDirty); + EXPECT_EQ(nullptr, scratchController->getScratchSpaceAllocation()); +} diff --git a/unit_tests/fixtures/simple_arg_kernel_fixture.h b/unit_tests/fixtures/simple_arg_kernel_fixture.h index ead39d7648..2508fa0a25 100644 --- a/unit_tests/fixtures/simple_arg_kernel_fixture.h +++ b/unit_tests/fixtures/simple_arg_kernel_fixture.h @@ -239,7 +239,7 @@ class SimpleKernelFixture : public ProgramFixture { size_t kernelsCount; cl_int retVal = CL_SUCCESS; - std::unique_ptr kernels[6] = {}; + std::unique_ptr kernels[8] = {}; uint32_t kernelIds = 0; }; diff --git a/unit_tests/fixtures/ult_command_stream_receiver_fixture.h b/unit_tests/fixtures/ult_command_stream_receiver_fixture.h index 2119f306ee..f1758e33ed 100644 --- a/unit_tests/fixtures/ult_command_stream_receiver_fixture.h +++ b/unit_tests/fixtures/ult_command_stream_receiver_fixture.h @@ -26,7 +26,7 @@ struct UltCommandStreamReceiverTest DeviceFixture::SetUp(); HardwareParse::SetUp(); - size_t sizeStream = 256; + size_t sizeStream = 512; size_t alignmentStream = 0x1000; cmdBuffer = alignedMalloc(sizeStream, alignmentStream); dshBuffer = alignedMalloc(sizeStream, alignmentStream); diff --git a/unit_tests/gen10/test_hw_info_config_cnl.cpp b/unit_tests/gen10/test_hw_info_config_cnl.cpp index 7158b91743..ed1ddbfea7 100644 --- a/unit_tests/gen10/test_hw_info_config_cnl.cpp +++ b/unit_tests/gen10/test_hw_info_config_cnl.cpp @@ -30,25 +30,25 @@ TEST(CnlHwInfoConfig, givenHwInfoConfigStringThenAfterSetupResultingHwInfoIsCorr strConfig = "2x4x8"; hardwareInfoSetup[productFamily](&gInfo, &fTable, false, strConfig); EXPECT_EQ(gInfo.SliceCount, 2u); - EXPECT_EQ(gInfo.SubSliceCount, 4u); + EXPECT_EQ(gInfo.SubSliceCount, 8u); EXPECT_EQ(gInfo.EUCount, 31u); strConfig = "2x5x8"; hardwareInfoSetup[productFamily](&gInfo, &fTable, false, strConfig); EXPECT_EQ(gInfo.SliceCount, 2u); - EXPECT_EQ(gInfo.SubSliceCount, 5u); + EXPECT_EQ(gInfo.SubSliceCount, 10u); EXPECT_EQ(gInfo.EUCount, 39u); strConfig = "4x9x8"; hardwareInfoSetup[productFamily](&gInfo, &fTable, false, strConfig); EXPECT_EQ(gInfo.SliceCount, 4u); - EXPECT_EQ(gInfo.SubSliceCount, 9u); + EXPECT_EQ(gInfo.SubSliceCount, 36u); EXPECT_EQ(gInfo.EUCount, 71u); strConfig = "default"; hardwareInfoSetup[productFamily](&gInfo, &fTable, false, strConfig); EXPECT_EQ(gInfo.SliceCount, 2u); - EXPECT_EQ(gInfo.SubSliceCount, 5u); + EXPECT_EQ(gInfo.SubSliceCount, 10u); EXPECT_EQ(gInfo.EUCount, 39u); strConfig = "erroneous"; diff --git a/unit_tests/helpers/hw_helper_tests.cpp b/unit_tests/helpers/hw_helper_tests.cpp index 617834e414..e79f59419a 100644 --- a/unit_tests/helpers/hw_helper_tests.cpp +++ b/unit_tests/helpers/hw_helper_tests.cpp @@ -5,14 +5,27 @@ * */ +#include "runtime/gmm_helper/gmm.h" +#include "runtime/gmm_helper/gmm_helper.h" +#include "runtime/gmm_helper/resource_info.h" +#include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/options.h" +#include "runtime/helpers/string.h" +#include "runtime/memory_manager/graphics_allocation.h" #include "unit_tests/helpers/debug_manager_state_restore.h" #include "unit_tests/helpers/hw_helper_tests.h" #include "unit_tests/helpers/unit_test_helper.h" -void HwHelperTest::SetUp() { +#include +#include +#include +#include + +void HwHelperFixture::SetUp() { + DeviceFixture::SetUp(); } -void HwHelperTest::TearDown() { +void HwHelperFixture::TearDown() { + DeviceFixture::TearDown(); } TEST_F(HwHelperTest, getReturnsValidHwHelperHw) { @@ -212,3 +225,336 @@ TEST(HwInfoTest, givenNodeOrdinalSetWhenChosenEngineTypeQueriedThenSetValueIsRet auto engineType = getChosenEngineType(hwInfo); EXPECT_EQ(EngineType::ENGINE_VECS, engineType); } + +HWTEST_F(HwHelperTest, givenCreatedSurfaceStateBufferWhenNoAllocationProvidedThenUseArgumentsasInput) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + auto gmmHelper = ee.getGmmHelper(); + + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + auto &helper = HwHelper::get(renderCoreFamily); + EXPECT_EQ(sizeof(RENDER_SURFACE_STATE), helper.getRenderSurfaceStateSize()); + + size_t size = 0x1000; + SURFACE_STATE_BUFFER_LENGTH length; + length.Length = static_cast(size - 1); + uint64_t addr = 0x2000; + size_t offset = 0x1000; + uint32_t pitch = 0x40; + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, offset, pitch, nullptr, 0, type, true); + + RENDER_SURFACE_STATE *state = reinterpret_cast(stateBuffer); + EXPECT_EQ(length.SurfaceState.Depth + 1u, state->getDepth()); + EXPECT_EQ(length.SurfaceState.Width + 1u, state->getWidth()); + EXPECT_EQ(length.SurfaceState.Height + 1u, state->getHeight()); + EXPECT_EQ(pitch, state->getSurfacePitch()); + addr += offset; + EXPECT_EQ(addr, state->getSurfaceBaseAddress()); + EXPECT_EQ(type, state->getSurfaceType()); + EXPECT_EQ(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER), state->getMemoryObjectControlState()); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + size = 0x1003; + length.Length = static_cast(alignUp(size, 4) - 1); + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, nullptr, 0, type, true); + EXPECT_EQ(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED), state->getMemoryObjectControlState()); + EXPECT_EQ(length.SurfaceState.Depth + 1u, state->getDepth()); + EXPECT_EQ(length.SurfaceState.Width + 1u, state->getWidth()); + EXPECT_EQ(length.SurfaceState.Height + 1u, state->getHeight()); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + size = 0x1000; + addr = 0x2001; + length.Length = static_cast(size - 1); + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, nullptr, 0, type, true); + EXPECT_EQ(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED), state->getMemoryObjectControlState()); + EXPECT_EQ(length.SurfaceState.Depth + 1u, state->getDepth()); + EXPECT_EQ(length.SurfaceState.Width + 1u, state->getWidth()); + EXPECT_EQ(length.SurfaceState.Height + 1u, state->getHeight()); + EXPECT_EQ(addr, state->getSurfaceBaseAddress()); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + size = 0x1005; + length.Length = static_cast(alignUp(size, 4) - 1); + cl_mem_flags flags = CL_MEM_READ_ONLY; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, nullptr, flags, type, true); + EXPECT_EQ(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER), state->getMemoryObjectControlState()); + EXPECT_EQ(length.SurfaceState.Depth + 1u, state->getDepth()); + EXPECT_EQ(length.SurfaceState.Width + 1u, state->getWidth()); + EXPECT_EQ(length.SurfaceState.Height + 1u, state->getHeight()); + EXPECT_EQ(addr, state->getSurfaceBaseAddress()); + + alignedFree(stateBuffer); +} + +HWTEST_F(HwHelperTest, givenCreatedSurfaceStateBufferWhenAllocationProvidedThenUseAllocationAsInput) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + RENDER_SURFACE_STATE *state = reinterpret_cast(stateBuffer); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + auto &helper = HwHelper::get(renderCoreFamily); + + size_t size = 0x1000; + SURFACE_STATE_BUFFER_LENGTH length; + uint64_t addr = 0x2000; + uint32_t pitch = 0; + + void *cpuAddr = reinterpret_cast(0x4000); + uint64_t gpuAddr = 0x4000u; + size_t allocSize = size; + length.Length = static_cast(allocSize - 1); + GraphicsAllocation allocation(cpuAddr, gpuAddr, 0u, allocSize, 0, false); + allocation.gmm = new Gmm(allocation.getUnderlyingBuffer(), allocation.getUnderlyingBufferSize(), false); + ASSERT_NE(nullptr, allocation.gmm); + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, &allocation, 0, type, true); + EXPECT_EQ(length.SurfaceState.Depth + 1u, state->getDepth()); + EXPECT_EQ(length.SurfaceState.Width + 1u, state->getWidth()); + EXPECT_EQ(length.SurfaceState.Height + 1u, state->getHeight()); + EXPECT_EQ(pitch, state->getSurfacePitch() - 1u); + EXPECT_EQ(gpuAddr, state->getSurfaceBaseAddress()); + + EXPECT_EQ(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT, state->getCoherencyType()); + EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, state->getAuxiliarySurfaceMode()); + + delete allocation.gmm; + alignedFree(stateBuffer); +} + +HWTEST_F(HwHelperTest, givenCreatedSurfaceStateBufferWhenGmmAndAllocationCompressionEnabledAnNonAuxDisabledThenSetCoherencyToGpuAndAuxModeToCompression) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + RENDER_SURFACE_STATE *state = reinterpret_cast(stateBuffer); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + auto &helper = HwHelper::get(renderCoreFamily); + + size_t size = 0x1000; + uint64_t addr = 0x2000; + uint32_t pitch = 0; + + void *cpuAddr = reinterpret_cast(0x4000); + uint64_t gpuAddr = 0x4000u; + size_t allocSize = size; + GraphicsAllocation allocation(cpuAddr, gpuAddr, 0u, allocSize, 0, false); + allocation.gmm = new Gmm(allocation.getUnderlyingBuffer(), allocation.getUnderlyingBufferSize(), false); + ASSERT_NE(nullptr, allocation.gmm); + allocation.gmm->isRenderCompressed = true; + allocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED); + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, &allocation, 0, type, false); + EXPECT_EQ(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT, state->getCoherencyType()); + EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E, state->getAuxiliarySurfaceMode()); + + delete allocation.gmm; + alignedFree(stateBuffer); +} + +HWTEST_F(HwHelperTest, givenCreatedSurfaceStateBufferWhenGmmCompressionEnabledAndAllocationDisabledAnNonAuxDisabledThenSetCoherencyToIaAndAuxModeToNone) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + RENDER_SURFACE_STATE *state = reinterpret_cast(stateBuffer); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + auto &helper = HwHelper::get(renderCoreFamily); + + size_t size = 0x1000; + uint64_t addr = 0x2000; + uint32_t pitch = 0; + + void *cpuAddr = reinterpret_cast(0x4000); + uint64_t gpuAddr = 0x4000u; + size_t allocSize = size; + GraphicsAllocation allocation(cpuAddr, gpuAddr, 0u, allocSize, 0, false); + allocation.gmm = new Gmm(allocation.getUnderlyingBuffer(), allocation.getUnderlyingBufferSize(), false); + ASSERT_NE(nullptr, allocation.gmm); + allocation.gmm->isRenderCompressed = true; + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, &allocation, 0, type, false); + EXPECT_EQ(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT, state->getCoherencyType()); + EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, state->getAuxiliarySurfaceMode()); + + delete allocation.gmm; + alignedFree(stateBuffer); +} + +HWTEST_F(HwHelperTest, givenCreatedSurfaceStateBufferWhenGmmCompressionDisabledAndAllocationEnabledAnNonAuxDisabledThenSetCoherencyToIaAndAuxModeToNone) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + RENDER_SURFACE_STATE *state = reinterpret_cast(stateBuffer); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + auto &helper = HwHelper::get(renderCoreFamily); + + size_t size = 0x1000; + uint64_t addr = 0x2000; + uint32_t pitch = 0; + + void *cpuAddr = reinterpret_cast(0x4000); + uint64_t gpuAddr = 0x4000u; + size_t allocSize = size; + GraphicsAllocation allocation(cpuAddr, gpuAddr, 0u, allocSize, 0, false); + allocation.gmm = new Gmm(allocation.getUnderlyingBuffer(), allocation.getUnderlyingBufferSize(), false); + ASSERT_NE(nullptr, allocation.gmm); + allocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED); + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, &allocation, 0, type, false); + EXPECT_EQ(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT, state->getCoherencyType()); + EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, state->getAuxiliarySurfaceMode()); + + delete allocation.gmm; + alignedFree(stateBuffer); +} + +HWTEST_F(HwHelperTest, givenCreatedSurfaceStateBufferWhenGmmAndAllocationCompressionEnabledAnNonAuxEnabledThenSetCoherencyToIaAndAuxModeToNone) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + RENDER_SURFACE_STATE *state = reinterpret_cast(stateBuffer); + + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + auto &helper = HwHelper::get(renderCoreFamily); + + size_t size = 0x1000; + uint64_t addr = 0x2000; + uint32_t pitch = 0; + + void *cpuAddr = reinterpret_cast(0x4000); + uint64_t gpuAddr = 0x4000u; + size_t allocSize = size; + GraphicsAllocation allocation(cpuAddr, gpuAddr, 0u, allocSize, 0, false); + allocation.gmm = new Gmm(allocation.getUnderlyingBuffer(), allocation.getUnderlyingBufferSize(), false); + ASSERT_NE(nullptr, allocation.gmm); + allocation.gmm->isRenderCompressed = true; + allocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED); + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + helper.setRenderSurfaceStateForBuffer(ee, stateBuffer, size, addr, 0, pitch, &allocation, 0, type, true); + EXPECT_EQ(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT, state->getCoherencyType()); + EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, state->getAuxiliarySurfaceMode()); + + delete allocation.gmm; + alignedFree(stateBuffer); +} + +HWTEST_F(HwHelperTest, DISABLED_profilingCreationOfRenderSurfaceStateVsMemcpyOfCachelineAlignedBuffer) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; + + constexpr uint32_t maxLoop = 1000u; + + std::vector> timesCreate; + timesCreate.reserve(maxLoop * 2); + + std::vector> timesMemCpy; + timesMemCpy.reserve(maxLoop * 2); + + std::vector nanoDurationCreate; + nanoDurationCreate.reserve(maxLoop); + + std::vector nanoDurationCpy; + nanoDurationCpy.reserve(maxLoop); + + std::vector surfaceStates; + surfaceStates.reserve(maxLoop); + + std::vector copyBuffers; + copyBuffers.reserve(maxLoop); + + for (uint32_t i = 0; i < maxLoop; ++i) { + void *stateBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, stateBuffer); + memset(stateBuffer, 0, sizeof(RENDER_SURFACE_STATE)); + surfaceStates.push_back(stateBuffer); + + void *copyBuffer = alignedMalloc(sizeof(RENDER_SURFACE_STATE), sizeof(RENDER_SURFACE_STATE)); + ASSERT_NE(nullptr, copyBuffer); + copyBuffers.push_back(copyBuffer); + } + + ExecutionEnvironment &ee = *pDevice->getExecutionEnvironment(); + auto &helper = HwHelper::get(renderCoreFamily); + + size_t size = 0x1000; + uint64_t addr = 0x2000; + uint32_t pitch = 0; + SURFACE_TYPE type = RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER; + + for (uint32_t i = 0; i < maxLoop; ++i) { + auto t1 = std::chrono::high_resolution_clock::now(); + helper.setRenderSurfaceStateForBuffer(ee, surfaceStates[i], size, addr, 0, pitch, nullptr, 0, type, true); + auto t2 = std::chrono::high_resolution_clock::now(); + timesCreate.push_back(t1); + timesCreate.push_back(t2); + } + + for (uint32_t i = 0; i < maxLoop; ++i) { + auto t1 = std::chrono::high_resolution_clock::now(); + memcpy_s(copyBuffers[i], sizeof(RENDER_SURFACE_STATE), surfaceStates[i], sizeof(RENDER_SURFACE_STATE)); + auto t2 = std::chrono::high_resolution_clock::now(); + timesMemCpy.push_back(t1); + timesMemCpy.push_back(t2); + } + + for (uint32_t i = 0; i < maxLoop; ++i) { + std::chrono::duration delta = timesCreate[i * 2 + 1] - timesCreate[i * 2]; + std::chrono::nanoseconds duration = std::chrono::duration_cast(delta); + nanoDurationCreate.push_back(duration.count()); + + delta = timesMemCpy[i * 2 + 1] - timesMemCpy[i * 2]; + duration = std::chrono::duration_cast(delta); + nanoDurationCpy.push_back(duration.count()); + } + + sort(nanoDurationCreate.begin(), nanoDurationCreate.end()); + sort(nanoDurationCpy.begin(), nanoDurationCpy.end()); + + double averageCreate = std::accumulate(nanoDurationCreate.begin(), nanoDurationCreate.end(), 0.0) / nanoDurationCreate.size(); + double averageCpy = std::accumulate(nanoDurationCpy.begin(), nanoDurationCpy.end(), 0.0) / nanoDurationCpy.size(); + + size_t middleCreate = nanoDurationCreate.size() / 2; + size_t middleCpy = nanoDurationCpy.size() / 2; + + std::cout << "Creation average: " << averageCreate << " median: " << nanoDurationCreate[middleCreate]; + std::cout << " min: " << nanoDurationCreate[0] << " max: " << nanoDurationCreate[nanoDurationCreate.size() - 1] << std::endl; + std::cout << "Copy average: " << averageCpy << " median: " << nanoDurationCpy[middleCpy]; + std::cout << " min: " << nanoDurationCpy[0] << " max: " << nanoDurationCpy[nanoDurationCpy.size() - 1] << std::endl; + + for (uint32_t i = 0; i < maxLoop; i++) { + std::cout << "#" << (i + 1) << " Create: " << nanoDurationCreate[i] << " Copy: " << nanoDurationCpy[i] << std::endl; + } + + for (uint32_t i = 0; i < maxLoop; ++i) { + alignedFree(surfaceStates[i]); + alignedFree(copyBuffers[i]); + } +} diff --git a/unit_tests/helpers/hw_helper_tests.h b/unit_tests/helpers/hw_helper_tests.h index a01379657f..20a5c8d77d 100644 --- a/unit_tests/helpers/hw_helper_tests.h +++ b/unit_tests/helpers/hw_helper_tests.h @@ -9,16 +9,19 @@ #include "runtime/device/device.h" #include "runtime/helpers/hw_helper.h" -#include "test.h" +#include "unit_tests/fixtures/device_fixture.h" #include "unit_tests/helpers/hw_info_helper.h" +#include "test.h" using namespace OCLRT; -class HwHelperTest : public testing::Test { +class HwHelperFixture : public DeviceFixture { protected: - void SetUp() override; - void TearDown() override; + void SetUp(); + void TearDown(); HwInfoHelper hwInfoHelper; }; +using HwHelperTest = Test; + void testDefaultImplementationOfSetupHardwareCapabilities(HwHelper &hwHelper, const HardwareInfo &hwInfo); diff --git a/unit_tests/kernel/kernel_tests.cpp b/unit_tests/kernel/kernel_tests.cpp index 80193cbecf..1cd6d5bb81 100644 --- a/unit_tests/kernel/kernel_tests.cpp +++ b/unit_tests/kernel/kernel_tests.cpp @@ -417,6 +417,41 @@ TEST_F(KernelFromBinaryTests, BuiltInIsSetToFalseForRegularKernels) { delete pKernel; pKernel = nullptr; + + pKernelInfo = pProgram->getKernelInfo("simple_kernel_6"); + + pKernel = Kernel::create( + pProgram, + *pKernelInfo, + &retVal); + + ASSERT_EQ(CL_SUCCESS, retVal); + ASSERT_NE(nullptr, pKernel); + + // get builtIn property + isBuiltIn = pKernel->isBuiltIn; + + EXPECT_FALSE(isBuiltIn); + + delete pKernel; + pKernel = nullptr; + pKernelInfo = pProgram->getKernelInfo("simple_kernel_7"); + + pKernel = Kernel::create( + pProgram, + *pKernelInfo, + &retVal); + + ASSERT_EQ(CL_SUCCESS, retVal); + ASSERT_NE(nullptr, pKernel); + + // get builtIn property + isBuiltIn = pKernel->isBuiltIn; + + EXPECT_FALSE(isBuiltIn); + + delete pKernel; + pKernel = nullptr; } TEST(PatchInfo, Constructor) { diff --git a/unit_tests/libult/mock_gfx_family.cpp b/unit_tests/libult/mock_gfx_family.cpp index 8d869fbfa9..e196585392 100644 --- a/unit_tests/libult/mock_gfx_family.cpp +++ b/unit_tests/libult/mock_gfx_family.cpp @@ -39,6 +39,7 @@ GENX::INTERFACE_DESCRIPTOR_DATA GENX::cmdInitInterfaceDescriptorData = GENX::INT GENX::MEDIA_STATE_FLUSH GENX::cmdInitMediaStateFlush = GENX::MEDIA_STATE_FLUSH::sInit(); GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD GENX::cmdInitMediaInterfaceDescriptorLoad = GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD::sInit(); GENX::MI_SEMAPHORE_WAIT GENX::cmdInitMiSemaphoreWait = GENX::MI_SEMAPHORE_WAIT::sInit(); +GENX::RENDER_SURFACE_STATE GENX::cmdRenderSurfaceState = GENX::RENDER_SURFACE_STATE::sInit(); bool GENX::enabledYTiling = true; template <> @@ -66,11 +67,6 @@ const AubMemDump::LrcaHelper &HwHelperHw::getCsTraits(EngineInstanceT engi return *AUBFamilyMapper::csTraits[engineInstance.type]; } -struct hw_helper_static_init { - hw_helper_static_init() { - hwHelperFactory[IGFX_UNKNOWN_CORE] = &HwHelperHw::get(); - } -}; template <> bool HwHelperHw::supportsYTiling() const { return GENX::enabledYTiling; @@ -78,6 +74,12 @@ bool HwHelperHw::supportsYTiling() const { template class HwHelperHw; +struct hw_helper_static_init { + hw_helper_static_init() { + hwHelperFactory[IGFX_UNKNOWN_CORE] = &HwHelperHw::get(); + } +}; + hw_helper_static_init si; template class GpgpuWalkerHelper; diff --git a/unit_tests/libult/mock_gfx_family.h b/unit_tests/libult/mock_gfx_family.h index e9661e3963..bb5bfd4d8b 100644 --- a/unit_tests/libult/mock_gfx_family.h +++ b/unit_tests/libult/mock_gfx_family.h @@ -287,6 +287,64 @@ struct GENX { } MI_BATCH_BUFFER_END; typedef struct tagRENDER_SURFACE_STATE { + typedef enum tagSURFACE_VERTICAL_ALIGNMENT { + SURFACE_VERTICAL_ALIGNMENT_VALIGN_4 = 0x1, + } SURFACE_VERTICAL_ALIGNMENT; + typedef enum tagSURFACE_HORIZONTAL_ALIGNMENT { + SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4 = 0x1, + } SURFACE_HORIZONTAL_ALIGNMENT; + typedef enum tagTILE_MODE { + TILE_MODE_LINEAR = 0x0, + } TILE_MODE; + typedef enum tagSURFACE_TYPE { + SURFACE_TYPE_SURFTYPE_BUFFER = 0x4, + SURFACE_TYPE_SURFTYPE_NULL = 0x7 + } SURFACE_TYPE; + typedef enum tagSURFACE_FORMAT { + SURFACE_FORMAT_RAW = 0x1ff, + } SURFACE_FORMAT; + typedef enum tagAUXILIARY_SURFACE_MODE { + AUXILIARY_SURFACE_MODE_AUX_NONE = 0x0, + AUXILIARY_SURFACE_MODE_AUX_CCS_E = 0x5, + } AUXILIARY_SURFACE_MODE; + typedef enum tagCOHERENCY_TYPE { + COHERENCY_TYPE_GPU_COHERENT = 0x0, + COHERENCY_TYPE_IA_COHERENT = 0x1, + } COHERENCY_TYPE; + static tagRENDER_SURFACE_STATE sInit(void) { + RENDER_SURFACE_STATE state; + return state; + } + inline void setWidth(const uint32_t value) { + } + inline void setHeight(const uint32_t value) { + } + inline void setDepth(const uint32_t value) { + } + inline void setSurfacePitch(const uint32_t value) { + } + inline void setSurfaceType(const SURFACE_TYPE value) { + } + inline void setSurfaceFormat(const SURFACE_FORMAT value) { + } + inline void setSurfaceVerticalAlignment(const SURFACE_VERTICAL_ALIGNMENT value) { + } + inline void setSurfaceHorizontalAlignment(const SURFACE_HORIZONTAL_ALIGNMENT value) { + } + inline void setTileMode(const TILE_MODE value) { + } + inline void setVerticalLineStride(const uint32_t value) { + } + inline void setVerticalLineStrideOffset(const uint32_t value) { + } + inline void setMemoryObjectControlState(const uint32_t value) { + } + inline void setSurfaceBaseAddress(const uint64_t value) { + } + inline void setCoherencyType(const COHERENCY_TYPE value) { + } + inline void setAuxiliarySurfaceMode(const AUXILIARY_SURFACE_MODE value) { + } } RENDER_SURFACE_STATE; typedef struct tagMEDIA_VFE_STATE { @@ -379,6 +437,7 @@ struct GENX { static MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad; static MI_SEMAPHORE_WAIT cmdInitMiSemaphoreWait; static PIPE_CONTROL cmdInitPipeControl; + static RENDER_SURFACE_STATE cmdRenderSurfaceState; }; template <> diff --git a/unit_tests/libult/ult_command_stream_receiver.h b/unit_tests/libult/ult_command_stream_receiver.h index 4b2df6a0eb..907f54bf82 100644 --- a/unit_tests/libult/ult_command_stream_receiver.h +++ b/unit_tests/libult/ult_command_stream_receiver.h @@ -7,7 +7,9 @@ #pragma once #include "runtime/command_stream/command_stream_receiver_hw.h" +#include "runtime/execution_environment/execution_environment.h" #include "runtime/memory_manager/os_agnostic_memory_manager.h" +#include "unit_tests/mocks/mock_experimental_command_buffer.h" #include #include @@ -20,7 +22,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass = CommandStreamReceiverHw; public: - using BaseClass::createScratchSpaceAllocation; + using BaseClass::deviceIndex; using BaseClass::dshState; using BaseClass::getScratchPatchAddress; using BaseClass::hwInfo; @@ -51,7 +53,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::requiredScratchSize; using BaseClass::CommandStreamReceiver::requiredThreadArbitrationPolicy; using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired; - using BaseClass::CommandStreamReceiver::scratchAllocation; + using BaseClass::CommandStreamReceiver::scratchSpaceController; using BaseClass::CommandStreamReceiver::stallingPipeControlOnNextFlushRequired; using BaseClass::CommandStreamReceiver::submissionAggregator; using BaseClass::CommandStreamReceiver::taskCount; diff --git a/unit_tests/main.cpp b/unit_tests/main.cpp index 62ded0c534..e9c4a8948d 100644 --- a/unit_tests/main.cpp +++ b/unit_tests/main.cpp @@ -18,6 +18,7 @@ #include "unit_tests/tests_configuration.h" #include "runtime/gmm_helper/resource_info.h" #include "runtime/os_interface/debug_settings_manager.h" +#include "runtime/os_interface/hw_info_config.h" #include "External/Common/GmmLibDllName.h" #include "mock_gmm_client_context.h" #include "gmock/gmock.h" @@ -185,9 +186,10 @@ int main(int argc, char **argv) { #endif ::testing::InitGoogleMock(&argc, argv); - + std::string hwInfoConfig = "default"; auto numDevices = numPlatformDevices; HardwareInfo device = DEFAULT_TEST_PLATFORM::hwInfo; + hardwareInfoSetup[device.pPlatform->eProductFamily](const_cast(device.pSysInfo), const_cast(device.pSkuTable), setupFeatureTable, hwInfoConfig); GT_SYSTEM_INFO gtSystemInfo = *device.pSysInfo; FeatureTable featureTable = *device.pSkuTable; @@ -195,7 +197,7 @@ int main(int argc, char **argv) { uint32_t euPerSubSlice = 0; uint32_t sliceCount = 0; uint32_t subSliceCount = 0; - int dieRecovery = 1; + int dieRecovery = 0; ::productFamily = device.pPlatform->eProductFamily; for (int i = 1; i < argc; ++i) { @@ -277,7 +279,7 @@ int main(int argc, char **argv) { return -1; } - uint32_t threadsPerEu = 7; + uint32_t threadsPerEu = hwInfoConfigFactory[productFamily]->threadsPerEu; PLATFORM platform; auto hardwareInfo = hardwareInfoTable[productFamily]; if (!hardwareInfo) { @@ -288,7 +290,6 @@ int main(int argc, char **argv) { platform.usRevId = (uint16_t)revisionId; // set Gt and FeatureTable to initial state - std::string hwInfoConfig = "default"; hardwareInfoSetup[productFamily](>SystemInfo, &featureTable, setupFeatureTable, hwInfoConfig); // and adjust dynamic values if not secified sliceCount = sliceCount > 0 ? sliceCount : gtSystemInfo.SliceCount; @@ -296,7 +297,7 @@ int main(int argc, char **argv) { euPerSubSlice = euPerSubSlice > 0 ? euPerSubSlice : gtSystemInfo.MaxEuPerSubSlice; // clang-format off gtSystemInfo.SliceCount = sliceCount; - gtSystemInfo.SubSliceCount = subSliceCount; + gtSystemInfo.SubSliceCount = gtSystemInfo.SliceCount * subSliceCount; gtSystemInfo.EUCount = gtSystemInfo.SubSliceCount * euPerSubSlice - dieRecovery; gtSystemInfo.ThreadCount = gtSystemInfo.EUCount * threadsPerEu; gtSystemInfo.MaxEuPerSubSlice = std::max(gtSystemInfo.MaxEuPerSubSlice, euPerSubSlice); diff --git a/unit_tests/mocks/mock_context.cpp b/unit_tests/mocks/mock_context.cpp index 2bc5f52733..75964dfc72 100644 --- a/unit_tests/mocks/mock_context.cpp +++ b/unit_tests/mocks/mock_context.cpp @@ -34,6 +34,7 @@ MockContext::MockContext(Device *device, bool noSpecialQueue) { MockContext::MockContext( void(CL_CALLBACK *funcNotify)(const char *, const void *, size_t, void *), void *data) { + device = nullptr; properties = nullptr; numProperties = 0; contextCallback = funcNotify; diff --git a/unit_tests/mocks/mock_kernel.h b/unit_tests/mocks/mock_kernel.h index 9c7f3c0811..f499fe4406 100644 --- a/unit_tests/mocks/mock_kernel.h +++ b/unit_tests/mocks/mock_kernel.h @@ -247,6 +247,7 @@ class MockKernelWithInternals { memset(&executionEnvironment, 0, sizeof(SPatchExecutionEnvironment)); memset(&executionEnvironmentBlock, 0, sizeof(SPatchExecutionEnvironment)); memset(&dataParameterStream, 0, sizeof(SPatchDataParameterStream)); + memset(&mediaVfeState, 0, sizeof(SPatchMediaVFEState)); executionEnvironment.NumGRFRequired = GrfConfig::DefaultGrfNumber; executionEnvironmentBlock.NumGRFRequired = GrfConfig::DefaultGrfNumber; kernelHeader.SurfaceStateHeapSize = sizeof(sshLocal); @@ -259,6 +260,7 @@ class MockKernelWithInternals { kernelInfo.patchInfo.dataParameterStream = &dataParameterStream; kernelInfo.patchInfo.executionEnvironment = &executionEnvironment; kernelInfo.patchInfo.threadPayload = &threadPayload; + kernelInfo.patchInfo.mediavfestate = &mediaVfeState; if (context == nullptr) { mockContext = new MockContext; @@ -287,9 +289,10 @@ class MockKernelWithInternals { MockProgram *mockProgram; Context *mockContext; KernelInfo kernelInfo; - SKernelBinaryHeaderCommon kernelHeader; - SPatchThreadPayload threadPayload; - SPatchDataParameterStream dataParameterStream; + SKernelBinaryHeaderCommon kernelHeader = {}; + SPatchThreadPayload threadPayload = {}; + SPatchMediaVFEState mediaVfeState = {}; + SPatchDataParameterStream dataParameterStream = {}; SPatchExecutionEnvironment executionEnvironment = {}; SPatchExecutionEnvironment executionEnvironmentBlock = {}; uint32_t kernelIsa[32]; diff --git a/unit_tests/test_files/simple_kernels.cl b/unit_tests/test_files/simple_kernels.cl index 3bcaad1f13..eb987d1ac5 100644 --- a/unit_tests/test_files/simple_kernels.cl +++ b/unit_tests/test_files/simple_kernels.cl @@ -48,3 +48,48 @@ __kernel void simple_kernel_4() { __kernel void simple_kernel_5(__global uint *dst) { atomic_inc(dst); } + +#define SIMPLE_KERNEL_6_ARRAY_SIZE 256 +__kernel void simple_kernel_6(__global uint *dst, __global uint2 *src, uint scalar, uint maxIterations, uint maxIterations2) { + __private uint2 array[SIMPLE_KERNEL_6_ARRAY_SIZE]; + __private uint2 sum; + __private size_t gid = get_global_id(0); + __private size_t lid = get_local_id(0); + + __private uint multi = 1; + if(lid == 1024) { + multi = 4; + } + sum = (uint2)(0, 0); + + for(int i = 0; i < maxIterations; ++i) { + array[i] = src[i] + (uint2)(i*multi, i*multi+scalar); + } + + for(int i = 0; i < maxIterations2; ++i) { + sum.x = array[i].x + sum.x; + sum.y = array[i].y + sum.y; + } + + vstore2(sum, gid, dst); +} + +typedef long16 TYPE; +__attribute__((reqd_work_group_size(32, 1, 1))) // force LWS to 32 +__attribute__((intel_reqd_sub_group_size(8))) // force SIMD to 8 +__kernel void simple_kernel_7(__global int *resIdx, global TYPE *src, global TYPE *dst){ + size_t lid = get_local_id(0); + size_t gid = get_global_id(0); + + TYPE res1 = src[gid*3]; + TYPE res2 = src[gid*3+1]; + TYPE res3 = src[gid*3+2]; + + __local TYPE locMem[32]; + locMem[lid] = res1; + barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); + TYPE res = (locMem[resIdx[gid]]*res3)*res2 + res1; + + dst[gid] = res; +}