From 5909a6b3d38db0517709b44ff926d0267afc8370 Mon Sep 17 00:00:00 2001 From: hjnapiat Date: Thu, 8 Feb 2018 16:00:20 +0100 Subject: [PATCH] Add support for GT-Pin Callbacks [3/n] Change-Id: Iea4b49efc9a666fde310ece15a9c69686d22f627 --- Jenkinsfile | 2 +- runtime/api/api.cpp | 13 +- runtime/command_queue/command_queue.cpp | 1 + runtime/command_queue/enqueue_common.h | 3 + .../command_stream_receiver.cpp | 2 + .../command_stream_receiver_hw.inl | 4 + runtime/context/context.cpp | 2 + runtime/gen8/gtpin_setup_gen8.cpp | 1 + runtime/gen9/gtpin_setup_gen9.cpp | 1 + runtime/gtpin/CMakeLists.txt | 2 + runtime/gtpin/gtpin_callback_stubs.cpp | 25 + runtime/gtpin/gtpin_callbacks.cpp | 180 ++- runtime/gtpin/gtpin_defs.h | 52 + runtime/gtpin/gtpin_hw_helper.h | 6 + runtime/gtpin/gtpin_hw_helper.inl | 74 + runtime/gtpin/gtpin_notify.h | 8 + runtime/helpers/base_object.h | 6 + runtime/helpers/kernel_commands.h | 10 +- runtime/helpers/kernel_commands.inl | 18 +- runtime/helpers/task_information.cpp | 11 +- runtime/kernel/kernel.cpp | 32 +- runtime/kernel/kernel.h | 10 +- runtime/platform/platform.cpp | 3 + runtime/program/kernel_info.h | 1 + runtime/utilities/CMakeLists.txt | 1 + runtime/utilities/spinlock.h | 40 + .../enqueue_execution_model_kernel_tests.cpp | 4 +- .../parent_kernel_dispatch_tests.cpp | 2 +- unit_tests/gtpin/gtpin_tests.cpp | 1433 ++++++++++++++++- unit_tests/helpers/base_object_tests.cpp | 14 + unit_tests/helpers/kernel_commands_tests.cpp | 11 +- unit_tests/utilities/CMakeLists.txt | 1 + unit_tests/utilities/spinlock_tests.cpp | 64 + 33 files changed, 1993 insertions(+), 44 deletions(-) create mode 100644 runtime/gtpin/gtpin_defs.h create mode 100644 runtime/gtpin/gtpin_hw_helper.inl create mode 100644 runtime/utilities/spinlock.h create mode 100644 unit_tests/utilities/spinlock_tests.cpp diff --git a/Jenkinsfile b/Jenkinsfile index a38b596b5b..1b8bdbaaa4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,4 +2,4 @@ neoDependenciesRev='735095-769' strategy='EQUAL' allowedF=43 -allowedCD=341 +allowedCD=340 diff --git a/runtime/api/api.cpp b/runtime/api/api.cpp index 1822a8b5ec..db7845c736 100644 --- a/runtime/api/api.cpp +++ b/runtime/api/api.cpp @@ -359,7 +359,6 @@ cl_int CL_API_CALL clReleaseContext(cl_context context) { Context *pContext = castToObject(context); if (pContext) { pContext->release(); - gtpinNotifyContextDestroy(context); return CL_SUCCESS; } @@ -1294,7 +1293,6 @@ cl_kernel CL_API_CALL clCreateKernel(cl_program clProgram, Program *pProgram = nullptr; cl_kernel kernel = nullptr; cl_int retVal = CL_SUCCESS; - DBG_LOG_INPUTS("clProgram", clProgram, "kernelName", kernelName); do { @@ -1350,6 +1348,9 @@ cl_int CL_API_CALL clCreateKernelsInProgram(cl_program clProgram, program, *kernelInfo, nullptr); + if (kernels[ordinal] != nullptr) { + gtpinNotifyKernelCreate(kernels[ordinal]); + } } } @@ -2559,6 +2560,11 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue, return retVal; } + auto pKernel = castToObjectOrAbort(kernel); + TakeOwnershipWrapper kernelOwnership(*pKernel, gtpinIsGTPinInitialized()); + + gtpinNotifyKernelSubmit(kernel, pCommandQueue); + retVal = pCommandQueue->enqueueKernel( kernel, workDim, @@ -3767,6 +3773,9 @@ cl_kernel CL_API_CALL clCloneKernel(cl_kernel sourceKernel, if (errcodeRet) { *errcodeRet = retVal; } + if (pClonedKernel != nullptr) { + gtpinNotifyKernelCreate(pClonedKernel); + } return pClonedKernel; } diff --git a/runtime/command_queue/command_queue.cpp b/runtime/command_queue/command_queue.cpp index 1f12fbaf5b..43cad7a661 100644 --- a/runtime/command_queue/command_queue.cpp +++ b/runtime/command_queue/command_queue.cpp @@ -28,6 +28,7 @@ #include "runtime/device_queue/device_queue.h" #include "runtime/event/event.h" #include "runtime/event/event_builder.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/array_count.h" #include "runtime/helpers/get_info.h" diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index ccc71a77ed..0031045c1e 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -27,6 +27,7 @@ #include "runtime/command_queue/dispatch_walker.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/event/event_builder.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/dispatch_info_builder.h" #include "runtime/mem_obj/buffer.h" @@ -549,6 +550,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); + gtpinNotifyPreFlushTask(this); + CompletionStamp completionStamp = commandStreamReceiver.flushTask( commandStream, commandStreamStart, diff --git a/runtime/command_stream/command_stream_receiver.cpp b/runtime/command_stream/command_stream_receiver.cpp index 8d252a1227..b52e011718 100644 --- a/runtime/command_stream/command_stream_receiver.cpp +++ b/runtime/command_stream/command_stream_receiver.cpp @@ -24,6 +24,7 @@ #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/command_stream/preemption.h" #include "runtime/device/device.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/memory_manager/memory_manager.h" #include "runtime/helpers/cache_policy.h" #include "runtime/os_interface/os_interface.h" @@ -192,6 +193,7 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int } } if (*getTagAddress() >= taskCountToWait) { + gtpinNotifyTaskCompletion(taskCountToWait); return true; } return false; diff --git a/runtime/command_stream/command_stream_receiver_hw.inl b/runtime/command_stream/command_stream_receiver_hw.inl index 43822b1355..7c4e6a1a2e 100644 --- a/runtime/command_stream/command_stream_receiver_hw.inl +++ b/runtime/command_stream/command_stream_receiver_hw.inl @@ -23,6 +23,7 @@ #include "runtime/command_stream/command_stream_receiver_hw.h" #include "runtime/command_stream/linear_stream.h" #include "runtime/device/device.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/cache_policy.h" #include "runtime/helpers/preamble.h" #include "runtime/helpers/ptr_math.h" @@ -377,6 +378,9 @@ CompletionStamp CommandStreamReceiverHw::flushTask( engineType}; this->taskLevel += levelClosed ? 1 : 0; + + gtpinNotifyFlushTask(completionStamp.taskCount); + return completionStamp; } diff --git a/runtime/context/context.cpp b/runtime/context/context.cpp index 3e06882d84..2f39febad7 100644 --- a/runtime/context/context.cpp +++ b/runtime/context/context.cpp @@ -25,6 +25,7 @@ #include "runtime/device/device.h" #include "runtime/device_queue/device_queue.h" #include "runtime/mem_obj/image.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/get_info.h" #include "runtime/helpers/ptr_math.h" #include "runtime/platform/platform.h" @@ -71,6 +72,7 @@ Context::~Context() { if (memoryManager && memoryManager->isAsyncDeleterEnabled()) { memoryManager->getDeferredDeleter()->removeClient(); } + gtpinNotifyContextDestroy((cl_context)this); } DeviceQueue *Context::getDefaultDeviceQueue() { diff --git a/runtime/gen8/gtpin_setup_gen8.cpp b/runtime/gen8/gtpin_setup_gen8.cpp index 77d59c3251..46fb239da9 100644 --- a/runtime/gen8/gtpin_setup_gen8.cpp +++ b/runtime/gen8/gtpin_setup_gen8.cpp @@ -22,6 +22,7 @@ #include "gtpin_ocl_interface.h" #include "runtime/gtpin/gtpin_hw_helper.h" +#include "runtime/gtpin/gtpin_hw_helper.inl" namespace OCLRT { diff --git a/runtime/gen9/gtpin_setup_gen9.cpp b/runtime/gen9/gtpin_setup_gen9.cpp index 2e50b259ba..3d0e78f0d8 100644 --- a/runtime/gen9/gtpin_setup_gen9.cpp +++ b/runtime/gen9/gtpin_setup_gen9.cpp @@ -22,6 +22,7 @@ #include "gtpin_ocl_interface.h" #include "runtime/gtpin/gtpin_hw_helper.h" +#include "runtime/gtpin/gtpin_hw_helper.inl" namespace OCLRT { diff --git a/runtime/gtpin/CMakeLists.txt b/runtime/gtpin/CMakeLists.txt index 5f94ec4ac1..aa1136ec72 100644 --- a/runtime/gtpin/CMakeLists.txt +++ b/runtime/gtpin/CMakeLists.txt @@ -26,9 +26,11 @@ if(GTPIN_HEADERS_DIR) ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_helpers.h ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_hw_helper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_hw_helper.h + ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_hw_helper.inl ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_init.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_init.h ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_notify.h + ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_defs.h PARENT_SCOPE ) else() diff --git a/runtime/gtpin/gtpin_callback_stubs.cpp b/runtime/gtpin/gtpin_callback_stubs.cpp index 7776a9ef11..6689c0bab7 100644 --- a/runtime/gtpin/gtpin_callback_stubs.cpp +++ b/runtime/gtpin/gtpin_callback_stubs.cpp @@ -32,4 +32,29 @@ void gtpinNotifyContextDestroy(cl_context context) { void gtpinNotifyKernelCreate(cl_kernel kernel) { } + +void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) { +} + +void gtpinNotifyPreFlushTask(void *pCmdQueue) { +} + +void gtpinNotifyFlushTask(uint32_t flushedTaskCount) { +} + +void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) { +} + +void gtpinNotifyMakeResident(void *pKernel, void *pCommandStreamReceiver) { +} + +void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResidencyVector) { +} + +void gtpinNotifyPlatformShutdown() { +} + +bool gtpinIsGTPinInitialized() { + return false; +} } diff --git a/runtime/gtpin/gtpin_callbacks.cpp b/runtime/gtpin/gtpin_callbacks.cpp index bd2d40938c..f5d80c64db 100644 --- a/runtime/gtpin/gtpin_callbacks.cpp +++ b/runtime/gtpin/gtpin_callbacks.cpp @@ -23,20 +23,33 @@ #include "config.h" #include "gtpin_ocl_interface.h" #include "CL/cl.h" +#include "runtime/command_queue/command_queue.h" +#include "runtime/command_stream/command_stream_receiver.h" #include "runtime/context/context.h" #include "runtime/device/device.h" #include "runtime/device/device_info.h" +#include "runtime/gtpin/gtpin_defs.h" #include "runtime/gtpin/gtpin_hw_helper.h" #include "runtime/kernel/kernel.h" +#include "runtime/mem_obj/buffer.h" +#include "runtime/memory_manager/surface.h" #include "runtime/platform/platform.h" +#include "runtime/utilities/spinlock.h" +#include +#include using namespace gtpin; namespace OCLRT { + extern bool isGTPinInitialized; extern gtpin::ocl::gtpin_events_t GTPinCallbacks; igc_init_t *pIgcInfo = nullptr; +std::atomic sequenceCount(1); +CommandQueue *pCmdQueueForFlushTask = nullptr; +std::deque kernelExecQueue; +std::atomic_flag kernelExecQueueLock = ATOMIC_FLAG_INIT; void gtpinNotifyContextCreate(cl_context context) { if (isGTPinInitialized) { @@ -47,7 +60,6 @@ void gtpinNotifyContextCreate(cl_context context) { GTPinHwHelper >pinHelper = GTPinHwHelper::get(genFamily); gtpinPlatformInfo.gen_version = (gtpin::GTPIN_GEN_VERSION)gtpinHelper.getGenVersion(); gtpinPlatformInfo.device_id = static_cast(pDevice->getHardwareInfo().pPlatform->usDeviceID); - (*GTPinCallbacks.onContextCreate)((context_handle_t)context, >pinPlatformInfo, &pIgcInfo); } } @@ -60,24 +72,178 @@ void gtpinNotifyContextDestroy(cl_context context) { void gtpinNotifyKernelCreate(cl_kernel kernel) { if (isGTPinInitialized) { - auto pKernel = castToObject(kernel); + auto pKernel = castToObjectOrAbort(kernel); + size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(); + // Enlarge local copy of SSH by 1 SS + auto pPlatform = platform(); + auto pDevice = pPlatform->getDevice(0); + GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + GTPinHwHelper >pinHelper = GTPinHwHelper::get(genFamily); + if (!gtpinHelper.addSurfaceState(pKernel)) { + // Kernel with no SSH or Kernel EM, not supported + return; + } + if (pKernel->isKernelHeapSubstituted()) { + // ISA for this kernel was already substituted + return; + } + // Notify GT-Pin that new kernel was created Context *pContext = &(pKernel->getContext()); cl_context context = (cl_context)pContext; - const KernelInfo &kInfo = pKernel->getKernelInfo(); + auto &kernelInfo = pKernel->getKernelInfo(); instrument_params_in_t paramsIn; paramsIn.kernel_type = GTPIN_KERNEL_TYPE_CS; - paramsIn.simd = (GTPIN_SIMD_WIDTH)kInfo.getMaxSimdSize(); + paramsIn.simd = (GTPIN_SIMD_WIDTH)kernelInfo.getMaxSimdSize(); paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap(); paramsIn.orig_kernel_size = static_cast(pKernel->getKernelHeapSize()); paramsIn.buffer_type = GTPIN_BUFFER_BINDFULL; - paramsIn.buffer_desc.BTI = kInfo.patchInfo.bindingTableState->Count; - paramsIn.igc_hash_id = kInfo.heapInfo.pKernelHeader->ShaderHashCode; - paramsIn.kernel_name = (char *)kInfo.name.c_str(); + paramsIn.buffer_desc.BTI = static_cast(gtpinBTI); + paramsIn.igc_hash_id = kernelInfo.heapInfo.pKernelHeader->ShaderHashCode; + paramsIn.kernel_name = (char *)kernelInfo.name.c_str(); paramsIn.igc_info = nullptr; instrument_params_out_t paramsOut = {0}; (*GTPinCallbacks.onKernelCreate)((context_handle_t)(cl_context)context, ¶msIn, ¶msOut); + // Substitute ISA of created kernel with instrumented code pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size); pKernel->setKernelId(paramsOut.kernel_id); } } + +void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) { + if (isGTPinInitialized) { + auto pKernel = castToObjectOrAbort(kernel); + if (pKernel->getSurfaceStateHeapSize() == 0) { + // Kernel with no SSH, not supported + return; + } + Context *pContext = &(pKernel->getContext()); + cl_context context = (cl_context)pContext; + uint64_t kernelId = pKernel->getKernelId(); + command_buffer_handle_t commandBuffer = (command_buffer_handle_t)((uintptr_t)(sequenceCount++)); + uint32_t kernelOffset = 0; + resource_handle_t resource = 0; + // Notify GT-Pin that abstract "command buffer" was created + (*GTPinCallbacks.onCommandBufferCreate)((context_handle_t)context, commandBuffer); + // Notify GT-Pin that kernel was submited for execution + (*GTPinCallbacks.onKernelSubmit)(commandBuffer, kernelId, &kernelOffset, &resource); + // Create new record in Kernel Execution Queue describing submited kernel + gtpinkexec_t kExec; + kExec.pKernel = pKernel; + kExec.gtpinResource = (cl_mem)resource; + kExec.commandBuffer = commandBuffer; + kExec.pCommandQueue = (CommandQueue *)pCmdQueue; + SpinLock lock; + lock.enter(kernelExecQueueLock); + kernelExecQueue.push_back(kExec); + lock.leave(kernelExecQueueLock); + // Patch SSH[gtpinBTI] with GT-Pin resource + auto pPlatform = platform(); + auto pDevice = pPlatform->getDevice(0); + GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + GTPinHwHelper >pinHelper = GTPinHwHelper::get(genFamily); + size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1; + void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI); + cl_mem buffer = (cl_mem)resource; + auto pBuffer = castToObjectOrAbort(buffer); + pBuffer->setArgStateful(const_cast(pSurfaceState)); + } +} + +void gtpinNotifyPreFlushTask(void *pCmdQueue) { + if (isGTPinInitialized) { + pCmdQueueForFlushTask = (CommandQueue *)pCmdQueue; + } +} + +void gtpinNotifyFlushTask(uint32_t flushedTaskCount) { + if (isGTPinInitialized) { + SpinLock lock; + lock.enter(kernelExecQueueLock); + size_t numElems = kernelExecQueue.size(); + for (size_t n = 0; n < numElems; n++) { + if ((kernelExecQueue[n].pCommandQueue == pCmdQueueForFlushTask) && !kernelExecQueue[n].isTaskCountValid) { + // Update record in Kernel Execution Queue with kernel's TC + kernelExecQueue[n].isTaskCountValid = true; + kernelExecQueue[n].taskCount = flushedTaskCount; + break; + } + } + lock.leave(kernelExecQueueLock); + pCmdQueueForFlushTask = nullptr; + } +} + +void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) { + if (isGTPinInitialized) { + SpinLock lock; + lock.enter(kernelExecQueueLock); + size_t numElems = kernelExecQueue.size(); + for (size_t n = 0; n < numElems;) { + if (kernelExecQueue[n].isTaskCountValid && (kernelExecQueue[n].taskCount <= completedTaskCount)) { + // Notify GT-Pin that execution of "command buffer" was completed + (*GTPinCallbacks.onCommandBufferComplete)(kernelExecQueue[n].commandBuffer); + // Remove kernel's record from Kernel Execution Queue + kernelExecQueue.erase(kernelExecQueue.begin() + n); + numElems--; + } else { + n++; + } + } + lock.leave(kernelExecQueueLock); + } +} + +void gtpinNotifyMakeResident(void *pKernel, void *pCSR) { + if (isGTPinInitialized) { + SpinLock lock; + lock.enter(kernelExecQueueLock); + size_t numElems = kernelExecQueue.size(); + for (size_t n = 0; n < numElems; n++) { + if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident) { + // It's time for kernel to make resident its GT-Pin resource + CommandStreamReceiver *pCommandStreamReceiver = reinterpret_cast(pCSR); + cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource; + auto pBuffer = castToObjectOrAbort(gtpinBuffer); + GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(); + pCommandStreamReceiver->makeResident(*pGfxAlloc); + kernelExecQueue[n].isResourceResident = true; + break; + } + } + lock.leave(kernelExecQueueLock); + } +} + +void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) { + if (isGTPinInitialized) { + SpinLock lock; + lock.enter(kernelExecQueueLock); + size_t numElems = kernelExecQueue.size(); + for (size_t n = 0; n < numElems; n++) { + if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident) { + // It's time for kernel to update its residency list with its GT-Pin resource + std::vector *pResidencyVector = (std::vector *)pResVec; + cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource; + auto pBuffer = castToObjectOrAbort(gtpinBuffer); + GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(); + GeneralSurface *pSurface = new GeneralSurface(pGfxAlloc); + pResidencyVector->push_back(pSurface); + kernelExecQueue[n].isResourceResident = true; + break; + } + } + lock.leave(kernelExecQueueLock); + } +} + +void gtpinNotifyPlatformShutdown() { + if (isGTPinInitialized) { + // Clear Kernel Execution Queue + kernelExecQueue.clear(); + } +} + +bool gtpinIsGTPinInitialized() { + return isGTPinInitialized; +} } diff --git a/runtime/gtpin/gtpin_defs.h b/runtime/gtpin/gtpin_defs.h new file mode 100644 index 0000000000..bd4305ca21 --- /dev/null +++ b/runtime/gtpin/gtpin_defs.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "config.h" +#include "gtpin_ocl_interface.h" +#include "CL/cl.h" +#include "runtime/command_queue/command_queue.h" +#include "runtime/kernel/kernel.h" + +namespace OCLRT { + +struct GTPinKernelExec { + Kernel *pKernel; + cl_mem gtpinResource; + CommandQueue *pCommandQueue; + gtpin::command_buffer_handle_t commandBuffer; + uint32_t taskCount; + bool isTaskCountValid; + bool isResourceResident; + + GTPinKernelExec() { + pKernel = nullptr; + gtpinResource = nullptr; + pCommandQueue = nullptr; + commandBuffer = nullptr; + taskCount = 0; + isTaskCountValid = false; + isResourceResident = false; + } +}; +typedef struct GTPinKernelExec gtpinkexec_t; + +} // OCLRT diff --git a/runtime/gtpin/gtpin_hw_helper.h b/runtime/gtpin/gtpin_hw_helper.h index 752003d1cc..f42c15c7a4 100644 --- a/runtime/gtpin/gtpin_hw_helper.h +++ b/runtime/gtpin/gtpin_hw_helper.h @@ -24,10 +24,14 @@ #include "runtime/gen_common/hw_cmds.h" namespace OCLRT { +class Kernel; + class GTPinHwHelper { public: static GTPinHwHelper &get(GFXCORE_FAMILY gfxCore); virtual uint32_t getGenVersion() = 0; + virtual bool addSurfaceState(Kernel *pKernel) = 0; + virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0; protected: GTPinHwHelper(){}; @@ -41,6 +45,8 @@ class GTPinHwHelperHw : public GTPinHwHelper { return gtpinHwHelper; } uint32_t getGenVersion() override; + bool addSurfaceState(Kernel *pKernel) override; + void *getSurfaceState(Kernel *pKernel, size_t bti) override; private: GTPinHwHelperHw(){}; diff --git a/runtime/gtpin/gtpin_hw_helper.inl b/runtime/gtpin/gtpin_hw_helper.inl new file mode 100644 index 0000000000..91d4c92802 --- /dev/null +++ b/runtime/gtpin/gtpin_hw_helper.inl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "hw_cmds.h" +#include "runtime/gtpin/gtpin_hw_helper.h" +#include "runtime/helpers/string.h" +#include "runtime/kernel/kernel.h" + +namespace OCLRT { + +template +bool GTPinHwHelperHw::addSurfaceState(Kernel *pKernel) { + using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; + using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; + + size_t sshSize = pKernel->getSurfaceStateHeapSize(); + if ((sshSize == 0) || pKernel->isParentKernel) { + // Kernels which do not use SSH or use Execution Model are not supported (yet) + return false; + } + size_t ssSize = sizeof(RENDER_SURFACE_STATE); + size_t btsSize = sizeof(BINDING_TABLE_STATE); + size_t sizeToEnlarge = ssSize + btsSize; + size_t currBTOffset = pKernel->getBindingTableOffset(); + size_t currSurfaceStateSize = currBTOffset; + char *pSsh = reinterpret_cast(pKernel->getSurfaceStateHeap()); + char *pNewSsh = new char[sshSize + sizeToEnlarge]; + memcpy_s(pNewSsh, sshSize + sizeToEnlarge, pSsh, currSurfaceStateSize); + RENDER_SURFACE_STATE *pSS = reinterpret_cast(pNewSsh + currSurfaceStateSize); + pSS->init(); + size_t newSurfaceStateSize = currSurfaceStateSize + ssSize; + size_t currBTCount = pKernel->getNumberOfBindingTableStates(); + memcpy_s(pNewSsh + newSurfaceStateSize, sshSize + sizeToEnlarge - newSurfaceStateSize, pSsh + currBTOffset, currBTCount * btsSize); + BINDING_TABLE_STATE *pNewBTS = reinterpret_cast(pNewSsh + newSurfaceStateSize + currBTCount * btsSize); + BINDING_TABLE_STATE bti; + bti.init(); + bti.setSurfaceStatePointer((uint64_t)currBTOffset); + *pNewBTS = bti; + pKernel->resizeSurfaceStateHeap(pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize); + return true; +} + +template +void *GTPinHwHelperHw::getSurfaceState(Kernel *pKernel, size_t bti) { + using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; + + if ((nullptr == pKernel->getSurfaceStateHeap()) || (bti >= pKernel->getNumberOfBindingTableStates())) { + return nullptr; + } + auto *pBts = reinterpret_cast(ptrOffset(pKernel->getSurfaceStateHeap(), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE)))); + auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(), pBts->getSurfaceStatePointer()); + return pSurfaceState; +} + +} // namespace OCLRT diff --git a/runtime/gtpin/gtpin_notify.h b/runtime/gtpin/gtpin_notify.h index b12e85df9a..42d6e158ea 100644 --- a/runtime/gtpin/gtpin_notify.h +++ b/runtime/gtpin/gtpin_notify.h @@ -26,4 +26,12 @@ namespace OCLRT { void gtpinNotifyContextCreate(cl_context context); void gtpinNotifyContextDestroy(cl_context context); void gtpinNotifyKernelCreate(cl_kernel kernel); +void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue); +void gtpinNotifyPreFlushTask(void *pCmdQueue); +void gtpinNotifyFlushTask(uint32_t flushedTaskCount); +void gtpinNotifyTaskCompletion(uint32_t completedTaskCount); +void gtpinNotifyMakeResident(void *pKernel, void *pCommandStreamReceiver); +void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResidencyVector); +void gtpinNotifyPlatformShutdown(); +bool gtpinIsGTPinInitialized(); } diff --git a/runtime/helpers/base_object.h b/runtime/helpers/base_object.h index fbcc3db799..f6ac1bbc36 100644 --- a/runtime/helpers/base_object.h +++ b/runtime/helpers/base_object.h @@ -107,6 +107,12 @@ class TakeOwnershipWrapper { : obj(obj) { this->locked = obj.takeOwnership(true); } + TakeOwnershipWrapper(T &obj, bool lockImmediately) + : obj(obj) { + if (lockImmediately) { + this->locked = obj.takeOwnership(true); + } + } ~TakeOwnershipWrapper() { if (locked) { obj.releaseOwnership(); diff --git a/runtime/helpers/kernel_commands.h b/runtime/helpers/kernel_commands.h index 0d6aa302b9..eb6fb0c783 100644 --- a/runtime/helpers/kernel_commands.h +++ b/runtime/helpers/kernel_commands.h @@ -76,16 +76,20 @@ struct KernelCommandsHelper : public PerThreadDataHelper { const Kernel &kernel); static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo, - const void *srcKernelSsh, size_t srcKernelSshSize); + const void *srcKernelSsh, size_t srcKernelSshSize, + size_t numberOfBindingTableStates, size_t offsetOfBindingTable); static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo) { return pushBindingTableAndSurfaceStates(dstHeap, srcKernelInfo, srcKernelInfo.heapInfo.pSsh, - srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize); + srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize, + (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0, + (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Offset : 0); } static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) { return pushBindingTableAndSurfaceStates(dstHeap, srcKernel.getKernelInfo(), - srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize()); + srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(), + srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset()); } static size_t sendIndirectState( diff --git a/runtime/helpers/kernel_commands.inl b/runtime/helpers/kernel_commands.inl index 477eb58c6d..fae1353a98 100644 --- a/runtime/helpers/kernel_commands.inl +++ b/runtime/helpers/kernel_commands.inl @@ -265,7 +265,8 @@ size_t KernelCommandsHelper::sendCrossThreadData( // as required by the INTERFACE_DESCRIPTOR_DATA. template size_t KernelCommandsHelper::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo, - const void *srcKernelSsh, size_t srcKernelSshSize) { + const void *srcKernelSsh, size_t srcKernelSshSize, + size_t numberOfBindingTableStates, size_t offsetOfBindingTable) { using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; @@ -274,9 +275,8 @@ size_t KernelCommandsHelper::pushBindingTableAndSurfaceStates(Indirec // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch return 0; } - size_t sshSize = srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize; - DEBUG_BREAK_IF(!((sshSize <= srcKernelSshSize) && (srcKernelSsh != nullptr))); - uint32_t localBtiOffset = srcKernelInfo.patchInfo.bindingTableState->Offset; + size_t sshSize = srcKernelSshSize; + DEBUG_BREAK_IF(srcKernelSsh == nullptr); auto srcSurfaceState = srcKernelSsh; // Align the heap and allocate space for new ssh data @@ -289,21 +289,21 @@ size_t KernelCommandsHelper::pushBindingTableAndSurfaceStates(Indirec // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address) // we need to simply copy the ssh (including BTIs from compiler) memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize); - return localBtiOffset; + return offsetOfBindingTable; } // We can copy-over the surface states, but BTIs will need to be patched - memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, localBtiOffset); + memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable); uint32_t surfaceStatesOffset = static_cast(ptrDiff(dstSurfaceState, dstHeap.getBase())); // march over BTIs and offset the pointers based on surface state base address - auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, localBtiOffset)); + auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, offsetOfBindingTable)); DEBUG_BREAK_IF(reinterpret_cast(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0); - auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, localBtiOffset)); + auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, offsetOfBindingTable)); BINDING_TABLE_STATE bti; bti.init(); // init whole DWORD - i.e. not just the SurfaceStatePointer bits - for (uint32_t i = 0, e = srcKernelInfo.patchInfo.bindingTableState->Count; i != e; ++i) { + for (uint32_t i = 0, e = (uint32_t)numberOfBindingTableStates; i != e; ++i) { uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer(); uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset; bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits diff --git a/runtime/helpers/task_information.cpp b/runtime/helpers/task_information.cpp index 561da00e33..0eb63e2bbe 100644 --- a/runtime/helpers/task_information.cpp +++ b/runtime/helpers/task_information.cpp @@ -26,6 +26,7 @@ #include "runtime/command_queue/enqueue_common.h" #include "runtime/device/device.h" #include "runtime/device_queue/device_queue.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/mem_obj/image.h" #include "runtime/memory_manager/surface.h" #include "runtime/helpers/aligned_memory.h" @@ -76,6 +77,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) { DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); + gtpinNotifyPreFlushTask(&cmdQ); + completionStamp = csr.flushTask(queueCommandStream, offset, cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE), @@ -267,6 +270,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); + gtpinNotifyPreFlushTask(&commandQueue); + completionStamp = commandStreamReceiver.flushTask(queueCommandStream, offset, *dsh, @@ -275,12 +280,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate ssh, taskLevel, dispatchFlags); - - commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp); - for (auto &surface : surfaces) { surface->setCompletionStamp(completionStamp, nullptr, nullptr); } + commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp); if (printfHandler) { printfHandler.get()->printEnqueueOutput(); @@ -309,6 +312,8 @@ CompletionStamp &CommandMarker::submit(uint32_t taskLevel, bool terminated) { DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); + gtpinNotifyPreFlushTask(&cmdQ); + completionStamp = csr.flushTask(queueCommandStream, offset, cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE), diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp index 96e8e2e29d..2b2db124c8 100644 --- a/runtime/kernel/kernel.cpp +++ b/runtime/kernel/kernel.cpp @@ -30,6 +30,7 @@ #include "runtime/helpers/surface_formats.h" #include "runtime/device_queue/device_queue.h" #include "runtime/execution_model/device_enqueue.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/basic_math.h" #include "runtime/helpers/debug_helpers.h" @@ -94,6 +95,8 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, const Devic context(nullptr), device(deviceArg), kernelInfo(kernelInfoArg), + numberOfBindingTableStates(0), + localBindingTableOffset(0), pSshLocal(nullptr), sshLocalSize(0), crossThreadData(nullptr), @@ -245,6 +248,8 @@ cl_int Kernel::initialize() { // copy the ssh into our local copy memcpy_s(pSshLocal, sshLocalSize, heapInfo.pSsh, sshLocalSize); } + numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0; + localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0; // patch crossthread data and ssh with inline surfaces, if necessary privateSurfaceSize = patchInfo.pAllocateStatelessPrivateSurface @@ -690,6 +695,11 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) *pKernelHeap = newKernelHeap; SKernelBinaryHeaderCommon *pHeader = const_cast(pKernelInfo->heapInfo.pKernelHeader); pHeader->KernelHeapSize = static_cast(newKernelHeapSize); + pKernelInfo->isKernelHeapSubstituted = true; +} + +bool Kernel::isKernelHeapSubstituted() const { + return kernelInfo.isKernelHeapSubstituted; } uint64_t Kernel::getKernelId() const { @@ -721,16 +731,20 @@ const void *Kernel::getDynamicStateHeap() const { size_t Kernel::getSurfaceStateHeapSize() const { return kernelInfo.usesSsh - ? kernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize + ? sshLocalSize : 0; } -size_t Kernel::getNumberOfSurfaceStates() const { - const auto &patchInfo = kernelInfo.patchInfo; - if (patchInfo.bindingTableState == nullptr) { - return 0; - } - return patchInfo.bindingTableState->Count; +size_t Kernel::getNumberOfBindingTableStates() const { + return numberOfBindingTableStates; +} + +void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) { + delete[] pSshLocal; + pSshLocal = reinterpret_cast(pNewSsh); + sshLocalSize = static_cast(newSshSize); + numberOfBindingTableStates = newBindingTableCount; + localBindingTableOffset = newBindingTableOffset; } uint32_t Kernel::getScratchSizeValueToProgramMediaVfeState(int scratchSize) { @@ -927,6 +941,8 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) { } makeArgsResident(commandStreamReceiver); + + gtpinNotifyMakeResident(this, &commandStreamReceiver); } void Kernel::getResidency(std::vector &dst) { @@ -964,6 +980,8 @@ void Kernel::getResidency(std::vector &dst) { } } } + + gtpinNotifyUpdateResidencyList(this, &dst); } bool Kernel::requiresCoherency() { diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index 175ed00635..36e33bbd3e 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -156,9 +156,15 @@ class Kernel : public BaseObject<_cl_kernel> { size_t getKernelHeapSize() const; size_t getSurfaceStateHeapSize() const; size_t getDynamicStateHeapSize() const; - size_t getNumberOfSurfaceStates() const; + size_t getNumberOfBindingTableStates() const; + size_t getBindingTableOffset() const { + return localBindingTableOffset; + } + + void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset); void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize); + bool isKernelHeapSubstituted() const; uint64_t getKernelId() const; void setKernelId(uint64_t newKernelId); @@ -434,6 +440,8 @@ class Kernel : public BaseObject<_cl_kernel> { std::vector kernelArgHandlers; std::vector kernelSvmGfxAllocations; + size_t numberOfBindingTableStates; + size_t localBindingTableOffset; char *pSshLocal; uint32_t sshLocalSize; diff --git a/runtime/platform/platform.cpp b/runtime/platform/platform.cpp index cc2437a6f6..acd55c6f69 100644 --- a/runtime/platform/platform.cpp +++ b/runtime/platform/platform.cpp @@ -25,6 +25,7 @@ #include "runtime/compiler_interface/compiler_interface.h" #include "CL/cl_ext.h" #include "runtime/device/device.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/debug_helpers.h" #include "runtime/helpers/get_info.h" #include "runtime/helpers/options.h" @@ -191,6 +192,8 @@ void Platform::shutdown() { DeviceFactory::releaseDevices(); std::string().swap(compilerExtensions); + + gtpinNotifyPlatformShutdown(); } Device *Platform::getDevice(size_t deviceOrdinal) { diff --git a/runtime/program/kernel_info.h b/runtime/program/kernel_info.h index 8b5b747cd8..a806e3d6aa 100644 --- a/runtime/program/kernel_info.h +++ b/runtime/program/kernel_info.h @@ -236,5 +236,6 @@ struct KernelInfo { uint32_t argumentsToPatchNum = 0; uint32_t systemKernelOffset = 0; uint64_t kernelId = 0; + bool isKernelHeapSubstituted = false; }; } // namespace OCLRT diff --git a/runtime/utilities/CMakeLists.txt b/runtime/utilities/CMakeLists.txt index 5e4e70ea82..8e384eb391 100644 --- a/runtime/utilities/CMakeLists.txt +++ b/runtime/utilities/CMakeLists.txt @@ -35,6 +35,7 @@ set(RUNTIME_SRCS_UTILITIES_BASE ${CMAKE_CURRENT_SOURCE_DIR}/perf_profiler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/perf_profiler.h ${CMAKE_CURRENT_SOURCE_DIR}/reference_tracked_object.h + ${CMAKE_CURRENT_SOURCE_DIR}/spinlock.h ${CMAKE_CURRENT_SOURCE_DIR}/stackvec.h ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.h ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator_base.h diff --git a/runtime/utilities/spinlock.h b/runtime/utilities/spinlock.h new file mode 100644 index 0000000000..88259559b8 --- /dev/null +++ b/runtime/utilities/spinlock.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include + +namespace OCLRT { + +class SpinLock { + public: + void enter(std::atomic_flag &spinLock) { + while (spinLock.test_and_set(std::memory_order_acquire)) { + }; + } + void leave(std::atomic_flag &spinLock) { + spinLock.clear(std::memory_order_release); + } +}; + +} // OCLRT diff --git a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp index 3fec6156e7..b80f3cf93a 100644 --- a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp +++ b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp @@ -214,7 +214,7 @@ HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenBlocksSurface Kernel *blockKernel = Kernel::create(pKernel->getProgram(), *pBlockInfo, nullptr); blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); - if (blockKernel->getNumberOfSurfaceStates() > 0) { + if (blockKernel->getNumberOfBindingTableStates() > 0) { ASSERT_NE(nullptr, pBlockInfo->patchInfo.bindingTableState); auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->patchInfo.bindingTableState->Offset); EXPECT_EQ(0U, reinterpret_cast(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE); @@ -222,7 +222,7 @@ HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenBlocksSurface auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->patchInfo.bindingTableState->Offset); auto srcBindingTable = reinterpret_cast(srcBlockBti); - for (uint32_t i = 0; i < blockKernel->getNumberOfSurfaceStates(); ++i) { + for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) { uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer(); uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer(); auto *dstSurfaceState = reinterpret_cast(ptrOffset(ssh->getBase(), dstSurfaceStatePointer)); diff --git a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp index 92ef47de60..36b3a56d39 100644 --- a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp +++ b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp @@ -188,7 +188,7 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz size_t sshUsed = blockedCommandsData->ssh->getUsed(); - size_t expectedSizeSSH = pKernel->getNumberOfSurfaceStates() * sizeof(RENDER_SURFACE_STATE) + pKernel->getKernelInfo().patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE); + size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE) + pKernel->getKernelInfo().patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE); if ((pKernel->requiresSshForBuffers()) || (pKernel->getKernelInfo().patchInfo.imageMemObjKernelArgs.size() > 0)) { EXPECT_EQ(expectedSizeSSH, sshUsed); diff --git a/unit_tests/gtpin/gtpin_tests.cpp b/unit_tests/gtpin/gtpin_tests.cpp index 17df82d2dc..4e58524611 100644 --- a/unit_tests/gtpin/gtpin_tests.cpp +++ b/unit_tests/gtpin/gtpin_tests.cpp @@ -23,13 +23,18 @@ #include "config.h" #include "runtime/context/context.h" #include "runtime/device/device.h" +#include "runtime/gtpin/gtpin_defs.h" #include "runtime/gtpin/gtpin_init.h" #include "runtime/gtpin/gtpin_helpers.h" +#include "runtime/gtpin/gtpin_hw_helper.h" +#include "runtime/gtpin/gtpin_notify.h" #include "runtime/helpers/basic_math.h" #include "runtime/helpers/file_io.h" +#include "runtime/helpers/hash.h" #include "runtime/helpers/options.h" #include "runtime/kernel/kernel.h" #include "runtime/mem_obj/buffer.h" +#include "runtime/memory_manager/surface.h" #include "unit_tests/fixtures/context_fixture.h" #include "unit_tests/fixtures/memory_management_fixture.h" #include "unit_tests/fixtures/platform_fixture.h" @@ -37,12 +42,15 @@ #include "unit_tests/helpers/test_files.h" #include "test.h" #include "gtest/gtest.h" +#include +#include using namespace OCLRT; using namespace gtpin; namespace OCLRT { extern bool isGTPinInitialized; +extern std::deque kernelExecQueue; } namespace ULT { @@ -50,12 +58,24 @@ namespace ULT { int ContextCreateCallbackCount = 0; int ContextDestroyCallbackCount = 0; int KernelCreateCallbackCount = 0; +int KernelSubmitCallbackCount = 0; +int CommandBufferCreateCallbackCount = 0; +int CommandBufferCompleteCallbackCount = 0; + +context_handle_t currContext = nullptr; + +std::deque kernelResources; void OnContextCreate(context_handle_t context, platform_info_t *platformInfo, igc_init_t **igcInit) { + currContext = context; + kernelResources.clear(); ContextCreateCallbackCount++; } void OnContextDestroy(context_handle_t context) { + currContext = nullptr; + EXPECT_EQ(0u, kernelResources.size()); + kernelResources.clear(); ContextDestroyCallbackCount++; } @@ -67,12 +87,40 @@ void OnKernelCreate(context_handle_t context, const instrument_params_in_t *para } void OnKernelSubmit(command_buffer_handle_t cb, uint64_t kernelId, uint32_t *entryOffset, resource_handle_t *resource) { + resource_handle_t currResource = nullptr; + ASSERT_NE(nullptr, currContext); + GTPIN_DI_STATUS st = gtpinCreateBuffer(currContext, (uint32_t)256, &currResource); + EXPECT_EQ(GTPIN_DI_SUCCESS, st); + EXPECT_NE(nullptr, currResource); + + uint8_t *bufAddress = nullptr; + st = gtpinMapBuffer(currContext, currResource, &bufAddress); + EXPECT_EQ(GTPIN_DI_SUCCESS, st); + EXPECT_NE(nullptr, bufAddress); + + *entryOffset = 0; + *resource = currResource; + kernelResources.push_back(currResource); + + KernelSubmitCallbackCount++; } void OnCommandBufferCreate(context_handle_t context, command_buffer_handle_t cb) { + CommandBufferCreateCallbackCount++; } void OnCommandBufferComplete(command_buffer_handle_t cb) { + ASSERT_NE(nullptr, currContext); + resource_handle_t currResource = kernelResources[0]; + EXPECT_NE(nullptr, currResource); + GTPIN_DI_STATUS st = gtpinUnmapBuffer(currContext, currResource); + EXPECT_EQ(GTPIN_DI_SUCCESS, st); + + st = gtpinFreeBuffer(currContext, currResource); + EXPECT_EQ(GTPIN_DI_SUCCESS, st); + kernelResources.pop_front(); + + CommandBufferCompleteCallbackCount++; } class GTPinFixture : public ContextFixture, public MemoryManagementFixture { @@ -124,14 +172,22 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture { typedef Test GTPinTests; TEST_F(GTPinTests, givenInvalidArgumentsThenGTPinInitFails) { + bool isInitialized = false; + retFromGtPin = GTPin_Init(nullptr, nullptr, nullptr); EXPECT_EQ(GTPIN_DI_ERROR_INVALID_ARGUMENT, retFromGtPin); + isInitialized = gtpinIsGTPinInitialized(); + EXPECT_FALSE(isInitialized); retFromGtPin = GTPin_Init(>pinCallbacks, nullptr, nullptr); EXPECT_EQ(GTPIN_DI_ERROR_INVALID_ARGUMENT, retFromGtPin); + isInitialized = gtpinIsGTPinInitialized(); + EXPECT_FALSE(isInitialized); retFromGtPin = GTPin_Init(nullptr, &driverServices, nullptr); EXPECT_EQ(GTPIN_DI_ERROR_INVALID_ARGUMENT, retFromGtPin); + isInitialized = gtpinIsGTPinInitialized(); + EXPECT_FALSE(isInitialized); } TEST_F(GTPinTests, givenIncompleteArgumentsThenGTPinInitFails) { @@ -178,6 +234,8 @@ TEST_F(GTPinTests, givenInvalidArgumentsWhenVersionArgumentIsProvidedThenGTPinIn } TEST_F(GTPinTests, givenValidAndCompleteArgumentsThenGTPinInitSucceeds) { + bool isInitialized = false; + gtpinCallbacks.onContextCreate = OnContextCreate; gtpinCallbacks.onContextDestroy = OnContextDestroy; gtpinCallbacks.onKernelCreate = OnKernelCreate; @@ -190,6 +248,8 @@ TEST_F(GTPinTests, givenValidAndCompleteArgumentsThenGTPinInitSucceeds) { EXPECT_EQ(&OCLRT::gtpinFreeBuffer, driverServices.bufferDeallocate); EXPECT_EQ(&OCLRT::gtpinMapBuffer, driverServices.bufferMap); EXPECT_EQ(&OCLRT::gtpinUnmapBuffer, driverServices.bufferUnMap); + isInitialized = gtpinIsGTPinInitialized(); + EXPECT_TRUE(isInitialized); } TEST_F(GTPinTests, givenValidAndCompleteArgumentsWhenGTPinIsAlreadyInitializedThenGTPinInitFails) { @@ -574,6 +634,7 @@ TEST_F(GTPinTests, givenUninitializedGTPinInterfaceThenGTPinKernelCreateCallback EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(prevCount, KernelCreateCallbackCount); + // Cleanup retVal = clReleaseKernel(kernel); EXPECT_EQ(CL_SUCCESS, retVal); @@ -583,7 +644,7 @@ TEST_F(GTPinTests, givenUninitializedGTPinInterfaceThenGTPinKernelCreateCallback deleteDataReadFromFile(pSource); } -TEST_F(GTPinTests, givenInitializedGTPinInterfaceThenGTPinKernelCreateCallbackIsCalled) { +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsExecutedThenGTPinCallbacksAreCalled) { gtpinCallbacks.onContextCreate = OnContextCreate; gtpinCallbacks.onContextDestroy = OnContextDestroy; gtpinCallbacks.onKernelCreate = OnKernelCreate; @@ -593,12 +654,16 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceThenGTPinKernelCreateCallbackIs retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); - cl_kernel kernel = nullptr; + cl_kernel kernel1 = nullptr; + cl_kernel kernel2 = nullptr; cl_program pProgram = nullptr; cl_device_id device = (cl_device_id)pDevice; void *pSource = nullptr; size_t sourceSize = 0; std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); testFile.append(clFiles); @@ -607,8 +672,16 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceThenGTPinKernelCreateCallbackIs EXPECT_NE(0u, sourceSize); EXPECT_NE(nullptr, pSource); + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + pProgram = clCreateProgramWithSource( - (cl_context)((Context *)pContext), + context, 1, (const char **)&pSource, &sourceSize, @@ -624,17 +697,1077 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceThenGTPinKernelCreateCallbackIs nullptr); EXPECT_EQ(CL_SUCCESS, retVal); + // Create and submit first instance of "CopyBuffer" kernel + int prevCount11 = KernelCreateCallbackCount; + kernel1 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel1); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount11 + 1, KernelCreateCallbackCount); + + Kernel *pKernel1 = (Kernel *)kernel1; + const KernelInfo &kInfo1 = pKernel1->getKernelInfo(); + uint64_t gtpinKernelId1 = pKernel1->getKernelId(); + EXPECT_EQ(kInfo1.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId1); + + constexpr size_t n = 256; + auto buff10 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff11 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel1, 0, sizeof(cl_mem), &buff10); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel1, 1, sizeof(cl_mem), &buff11); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount12 = KernelSubmitCallbackCount; + int prevCount13 = CommandBufferCreateCallbackCount; + int prevCount14 = CommandBufferCompleteCallbackCount; + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {n, 1, 1}; + size_t localWorkSize[3] = {1, 1, 1}; + retVal = clEnqueueNDRangeKernel(cmdQ, pKernel1, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount12 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount13 + 1, CommandBufferCreateCallbackCount); + + // Create and submit second instance of "CopyBuffer" kernel + int prevCount21 = KernelCreateCallbackCount; + kernel2 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + // Verify that GT-Pin Kernel Create callback is not called multiple times for the same kernel + EXPECT_EQ(prevCount21, KernelCreateCallbackCount); + + Kernel *pKernel2 = (Kernel *)kernel2; + const KernelInfo &kInfo2 = pKernel2->getKernelInfo(); + uint64_t gtpinKernelId2 = pKernel2->getKernelId(); + EXPECT_EQ(kInfo2.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId2); + + auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff21 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel2, 0, sizeof(cl_mem), &buff20); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel2, 1, sizeof(cl_mem), &buff21); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount22 = KernelSubmitCallbackCount; + int prevCount23 = CommandBufferCreateCallbackCount; + int prevCount24 = CommandBufferCompleteCallbackCount; + retVal = clEnqueueNDRangeKernel(cmdQ, pKernel2, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount22 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount23 + 1, CommandBufferCreateCallbackCount); + + retVal = clFinish(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount14 + 2, CommandBufferCompleteCallbackCount); + EXPECT_EQ(prevCount24 + 2, CommandBufferCompleteCallbackCount); + + auto taskCount0 = castToObject(buff10)->getCompletionStamp().taskCount; + auto taskCount1 = castToObject(buff11)->getCompletionStamp().taskCount; + EXPECT_EQ(1u, taskCount0); + EXPECT_EQ(1u, taskCount1); + + taskCount0 = castToObject(buff20)->getCompletionStamp().taskCount; + taskCount1 = castToObject(buff21)->getCompletionStamp().taskCount; + EXPECT_EQ(2u, taskCount0); + EXPECT_EQ(2u, taskCount1); + + // Cleanup + retVal = clReleaseKernel(kernel1); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseKernel(kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseMemObject(buff10); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff11); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff20); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff21); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenKernelCreateCallbacksIsNotCalled) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + cl_device_id device = (cl_device_id)pDevice; + cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + auto pContext = castToObject(context); + + // Prepare a kernel without SSH + char binary[1024] = {1, 2, 3, 4, 5, 6, 7, 8, 9, '\0'}; + size_t binSize = 10; + Program *pProgram = Program::createFromGenBinary(pContext, &binary[0], binSize, false, &retVal); + ASSERT_NE(nullptr, pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + char *pBin = &binary[0]; + SProgramBinaryHeader *pBHdr = (SProgramBinaryHeader *)pBin; + pBHdr->Magic = iOpenCL::MAGIC_CL; + pBHdr->Version = iOpenCL::CURRENT_ICBE_VERSION; + pBHdr->Device = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + pBHdr->GPUPointerSizeInBytes = 8; + pBHdr->NumberOfKernels = 1; + pBHdr->SteppingId = 0; + pBHdr->PatchListSize = 0; + pBin += sizeof(SProgramBinaryHeader); + binSize += sizeof(SProgramBinaryHeader); + + SKernelBinaryHeaderCommon *pKHdr = (SKernelBinaryHeaderCommon *)pBin; + pKHdr->CheckSum = 0; + pKHdr->ShaderHashCode = 0; + pKHdr->KernelNameSize = 4; + pKHdr->PatchListSize = 0; + pKHdr->KernelHeapSize = 16; + pKHdr->GeneralStateHeapSize = 0; + pKHdr->DynamicStateHeapSize = 0; + pKHdr->SurfaceStateHeapSize = 0; + pKHdr->KernelUnpaddedSize = 0; + pBin += sizeof(SKernelBinaryHeaderCommon); + binSize += sizeof(SKernelBinaryHeaderCommon); + char *pKernelBin = pBin; + + strcpy(pBin, "Tst"); + pBin += pKHdr->KernelNameSize; + binSize += pKHdr->KernelNameSize; + + strcpy(pBin, "fake_ISA_code__"); + binSize += pKHdr->KernelHeapSize; + + uint32_t kernelBinSize = + pKHdr->DynamicStateHeapSize + + pKHdr->GeneralStateHeapSize + + pKHdr->KernelHeapSize + + pKHdr->KernelNameSize + + pKHdr->PatchListSize + + pKHdr->SurfaceStateHeapSize; + uint64_t hashValue = Hash::hash(reinterpret_cast(pKernelBin), kernelBinSize); + pKHdr->CheckSum = static_cast(hashValue & 0xFFFFFFFF); + + pProgram->storeGenBinary(&binary[0], binSize); + retVal = pProgram->processGenBinary(); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Verify that GT-Pin Kernel Create callback is not called int prevCount = KernelCreateCallbackCount; + cl_kernel kernel = clCreateKernel(pProgram, "Tst", &retVal); + EXPECT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount, KernelCreateCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithExecEnvIsUsedThenKernelCreateCallbacksIsNotCalled) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + cl_device_id device = (cl_device_id)pDevice; + cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + auto pContext = castToObject(context); + + // Prepare a kernel with fake Execution Environment + char binary[1024] = {1, 2, 3, 4, 5, 6, 7, 8, 9, '\0'}; + size_t binSize = 10; + Program *pProgram = Program::createFromGenBinary(pContext, &binary[0], binSize, false, &retVal); + ASSERT_NE(nullptr, pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + char *pBin = &binary[0]; + SProgramBinaryHeader *pBHdr = (SProgramBinaryHeader *)pBin; + pBHdr->Magic = iOpenCL::MAGIC_CL; + pBHdr->Version = iOpenCL::CURRENT_ICBE_VERSION; + pBHdr->Device = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + pBHdr->GPUPointerSizeInBytes = 8; + pBHdr->NumberOfKernels = 1; + pBHdr->SteppingId = 0; + pBHdr->PatchListSize = 0; + pBin += sizeof(SProgramBinaryHeader); + binSize += sizeof(SProgramBinaryHeader); + + SKernelBinaryHeaderCommon *pKHdr = (SKernelBinaryHeaderCommon *)pBin; + pKHdr->CheckSum = 0; + pKHdr->ShaderHashCode = 0; + pKHdr->KernelNameSize = 4; + pKHdr->PatchListSize = sizeof(SPatchExecutionEnvironment) + sizeof(SPatchBindingTableState); + pKHdr->KernelHeapSize = 16; + pKHdr->GeneralStateHeapSize = 0; + pKHdr->DynamicStateHeapSize = 0; + pKHdr->SurfaceStateHeapSize = 64; + pKHdr->KernelUnpaddedSize = 0; + pBin += sizeof(SKernelBinaryHeaderCommon); + binSize += sizeof(SKernelBinaryHeaderCommon); + char *pKernelBin = pBin; + + strcpy(pBin, "Tst"); + pBin += pKHdr->KernelNameSize; + binSize += pKHdr->KernelNameSize; + + strcpy(pBin, "fake_ISA_code__"); + pBin += pKHdr->KernelHeapSize; + binSize += pKHdr->KernelHeapSize; + + memset(pBin, 0, pKHdr->SurfaceStateHeapSize); + pBin += pKHdr->SurfaceStateHeapSize; + binSize += pKHdr->SurfaceStateHeapSize; + + SPatchExecutionEnvironment *pPatch1 = (SPatchExecutionEnvironment *)pBin; + pPatch1->Token = iOpenCL::PATCH_TOKEN_EXECUTION_ENVIRONMENT; + pPatch1->Size = sizeof(iOpenCL::SPatchExecutionEnvironment); + pPatch1->RequiredWorkGroupSizeX = 0; + pPatch1->RequiredWorkGroupSizeY = 0; + pPatch1->RequiredWorkGroupSizeZ = 0; + pPatch1->LargestCompiledSIMDSize = 8; + pPatch1->CompiledSubGroupsNumber = 0; + pPatch1->HasBarriers = 0; + pPatch1->DisableMidThreadPreemption = 0; + pPatch1->CompiledSIMD8 = 1; + pPatch1->CompiledSIMD16 = 0; + pPatch1->CompiledSIMD32 = 0; + pPatch1->HasDeviceEnqueue = 1; + pPatch1->MayAccessUndeclaredResource = 0; + pPatch1->UsesFencesForReadWriteImages = 0; + pPatch1->UsesStatelessSpillFill = 0; + pPatch1->IsCoherent = 0; + pPatch1->IsInitializer = 0; + pPatch1->IsFinalizer = 0; + pPatch1->SubgroupIndependentForwardProgressRequired = 0; + pPatch1->CompiledForGreaterThan4GBBuffers = 0; + pBin += sizeof(SPatchExecutionEnvironment); + binSize += sizeof(SPatchExecutionEnvironment); + + SPatchBindingTableState *pPatch2 = (SPatchBindingTableState *)pBin; + pPatch2->Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE; + pPatch2->Size = sizeof(iOpenCL::SPatchBindingTableState); + pPatch2->Offset = 0; + pPatch2->Count = 1; + pPatch2->SurfaceStateOffset = 0; + binSize += sizeof(SPatchBindingTableState); + + uint32_t kernelBinSize = + pKHdr->DynamicStateHeapSize + + pKHdr->GeneralStateHeapSize + + pKHdr->KernelHeapSize + + pKHdr->KernelNameSize + + pKHdr->PatchListSize + + pKHdr->SurfaceStateHeapSize; + uint64_t hashValue = Hash::hash(reinterpret_cast(pKernelBin), kernelBinSize); + pKHdr->CheckSum = static_cast(hashValue & 0xFFFFFFFF); + + pProgram->storeGenBinary(&binary[0], binSize); + retVal = pProgram->processGenBinary(); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Verify that GT-Pin Kernel Create callback is not called + int prevCount = KernelCreateCallbackCount; + cl_kernel kernel = clCreateKernel(pProgram, "Tst", &retVal); + EXPECT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount, KernelCreateCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenGTPinSubmitKernelCallbackIsNotCalled) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + cl_kernel kernel = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount1 = KernelCreateCallbackCount; kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal); EXPECT_NE(nullptr, kernel); EXPECT_EQ(CL_SUCCESS, retVal); - EXPECT_EQ(prevCount + 1, KernelCreateCallbackCount); + EXPECT_EQ(prevCount1 + 1, KernelCreateCallbackCount); Kernel *pKernel = (Kernel *)kernel; const KernelInfo &kInfo = pKernel->getKernelInfo(); uint64_t gtpinKernelId = pKernel->getKernelId(); EXPECT_EQ(kInfo.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId); + constexpr size_t n = 256; + auto buff0 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff1 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel, 0, sizeof(cl_mem), &buff0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel, 1, sizeof(cl_mem), &buff1); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Verify that when SSH is removed then during kernel execution + // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called. + pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0); + + int prevCount2 = KernelSubmitCallbackCount; + int prevCount3 = CommandBufferCreateCallbackCount; + int prevCount4 = CommandBufferCompleteCallbackCount; + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {n, 1, 1}; + size_t localWorkSize[3] = {1, 1, 1}; + retVal = clEnqueueNDRangeKernel(cmdQ, pKernel, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount2, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount3, CommandBufferCreateCallbackCount); + + retVal = clFinish(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount4, CommandBufferCompleteCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseMemObject(buff0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff1); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenBlockedKernelWithoutSSHIsUsedThenGTPinSubmitKernelCallbackIsNotCalled) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + cl_kernel kernel = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount1 = KernelCreateCallbackCount; + kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount1 + 1, KernelCreateCallbackCount); + + Kernel *pKernel = (Kernel *)kernel; + const KernelInfo &kInfo = pKernel->getKernelInfo(); + uint64_t gtpinKernelId = pKernel->getKernelId(); + EXPECT_EQ(kInfo.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId); + + constexpr size_t n = 256; + auto buff0 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff1 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel, 0, sizeof(cl_mem), &buff0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel, 1, sizeof(cl_mem), &buff1); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Verify that when SSH is removed then during kernel execution + // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called. + pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0); + + cl_event userEvent = clCreateUserEvent(context, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount2 = KernelSubmitCallbackCount; + int prevCount3 = CommandBufferCreateCallbackCount; + int prevCount4 = CommandBufferCompleteCallbackCount; + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {n, 1, 1}; + size_t localWorkSize[3] = {1, 1, 1}; + retVal = clEnqueueNDRangeKernel(cmdQ, pKernel, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 1, &userEvent, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount2, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount3, CommandBufferCreateCallbackCount); + + retVal = clSetUserEventStatus(userEvent, CL_COMPLETE); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clFinish(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount4, CommandBufferCompleteCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseMemObject(buff0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff1); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseEvent(userEvent); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenTheSameKerneIsExecutedTwiceThenGTPinCreateKernelCallbackIsCalledOnce) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + cl_kernel kernel1 = nullptr; + cl_kernel kernel2 = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Kernel "CopyBuffer" - called for the first time + int prevCount11 = KernelCreateCallbackCount; + kernel1 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel1); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount11 + 1, KernelCreateCallbackCount); + + Kernel *pKernel1 = (Kernel *)kernel1; + const KernelInfo &kInfo1 = pKernel1->getKernelInfo(); + uint64_t gtpinKernelId1 = pKernel1->getKernelId(); + EXPECT_EQ(kInfo1.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId1); + + constexpr size_t n = 256; + auto buff10 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff11 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel1, 0, sizeof(cl_mem), &buff10); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel1, 1, sizeof(cl_mem), &buff11); + EXPECT_EQ(CL_SUCCESS, retVal); + + cl_event userEvent = clCreateUserEvent(context, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount12 = KernelSubmitCallbackCount; + int prevCount13 = CommandBufferCreateCallbackCount; + int prevCount14 = CommandBufferCompleteCallbackCount; + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {n, 1, 1}; + size_t localWorkSize[3] = {1, 1, 1}; + retVal = clEnqueueNDRangeKernel(cmdQ, pKernel1, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 1, &userEvent, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount12 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount13 + 1, CommandBufferCreateCallbackCount); + EXPECT_EQ(prevCount14, CommandBufferCompleteCallbackCount); + + // The same kernel "CopyBuffer" - called second time + int prevCount21 = KernelCreateCallbackCount; + kernel2 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + // Verify that Kernel Create callback was not called now + EXPECT_EQ(prevCount21, KernelCreateCallbackCount); + + Kernel *pKernel2 = (Kernel *)kernel2; + const KernelInfo &kInfo2 = pKernel2->getKernelInfo(); + uint64_t gtpinKernelId2 = pKernel2->getKernelId(); + EXPECT_EQ(kInfo2.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId2); + + auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff21 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel2, 0, sizeof(cl_mem), &buff20); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel2, 1, sizeof(cl_mem), &buff21); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount22 = KernelSubmitCallbackCount; + int prevCount23 = CommandBufferCreateCallbackCount; + int prevCount24 = CommandBufferCompleteCallbackCount; + EXPECT_EQ(prevCount14, prevCount24); + retVal = clEnqueueNDRangeKernel(cmdQ, pKernel2, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount22 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount23 + 1, CommandBufferCreateCallbackCount); + EXPECT_EQ(prevCount14, CommandBufferCompleteCallbackCount); + EXPECT_EQ(prevCount24, CommandBufferCompleteCallbackCount); + EXPECT_EQ(prevCount14, prevCount24); + + clSetUserEventStatus(userEvent, CL_COMPLETE); + + retVal = clFinish(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + // Verify that both kernel instances were completed + EXPECT_EQ(prevCount14 + 2, CommandBufferCompleteCallbackCount); + EXPECT_EQ(prevCount24 + 2, CommandBufferCompleteCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel1); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseKernel(kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseMemObject(buff10); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff11); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff20); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff21); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseEvent(userEvent); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsCreatedThenAllKernelSubmitRelatedNotificationsAreCalled) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + kernelExecQueue.clear(); + + cl_kernel kernel = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Create kernel + int prevCount1 = KernelCreateCallbackCount; + kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal); + ASSERT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount1 + 1, KernelCreateCallbackCount); + + // Simulate that created kernel was sent for execution + auto pKernel = castToObject(kernel); + auto pCmdQueue = castToObject(cmdQ); + ASSERT_NE(nullptr, pKernel); + EXPECT_EQ(0u, kernelExecQueue.size()); + EXPECT_EQ(0u, kernelResources.size()); + int prevCount2 = CommandBufferCreateCallbackCount; + int prevCount3 = KernelSubmitCallbackCount; + gtpinNotifyKernelSubmit(kernel, pCmdQueue); + EXPECT_EQ(prevCount2 + 1, CommandBufferCreateCallbackCount); + EXPECT_EQ(prevCount3 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_EQ(1u, kernelResources.size()); + EXPECT_EQ(pKernel, kernelExecQueue[0].pKernel); + EXPECT_EQ(kernelResources[0], (resource_handle_t)kernelExecQueue[0].gtpinResource); + EXPECT_EQ(pCmdQueue, kernelExecQueue[0].pCommandQueue); + EXPECT_FALSE(kernelExecQueue[0].isTaskCountValid); + EXPECT_FALSE(kernelExecQueue[0].isResourceResident); + + // Verify that if kernel unknown to GT-Pin is about to be flushed + // then its residency vector does not obtain GT-Pin resource + std::vector residencyVector; + EXPECT_EQ(0u, residencyVector.size()); + gtpinNotifyUpdateResidencyList(nullptr, &residencyVector); + EXPECT_EQ(0u, residencyVector.size()); + EXPECT_FALSE(kernelExecQueue[0].isResourceResident); + + // Verify that if kernel known to GT-Pin is about to be flushed + // then its residency vector obtains GT-Pin resource + EXPECT_EQ(0u, residencyVector.size()); + gtpinNotifyUpdateResidencyList(pKernel, &residencyVector); + EXPECT_EQ(1u, residencyVector.size()); + GeneralSurface *pSurf = (GeneralSurface *)residencyVector[0]; + delete pSurf; + residencyVector.clear(); + EXPECT_TRUE(kernelExecQueue[0].isResourceResident); + kernelExecQueue[0].isResourceResident = false; + + // Create second kernel ... + cl_kernel kernel2 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + ASSERT_NE(nullptr, kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + // ... and simulate that it was sent for execution + auto pKernel2 = castToObject(kernel2); + ASSERT_NE(nullptr, pKernel2); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_EQ(1u, kernelResources.size()); + int prevCount22 = CommandBufferCreateCallbackCount; + int prevCount23 = KernelSubmitCallbackCount; + gtpinNotifyKernelSubmit(kernel2, pCmdQueue); + EXPECT_EQ(prevCount22 + 1, CommandBufferCreateCallbackCount); + EXPECT_EQ(prevCount23 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(2u, kernelExecQueue.size()); + EXPECT_EQ(2u, kernelResources.size()); + EXPECT_EQ(pKernel2, kernelExecQueue[1].pKernel); + EXPECT_EQ(kernelResources[1], (resource_handle_t)kernelExecQueue[1].gtpinResource); + EXPECT_EQ(pCmdQueue, kernelExecQueue[1].pCommandQueue); + EXPECT_FALSE(kernelExecQueue[1].isTaskCountValid); + EXPECT_FALSE(kernelExecQueue[1].isResourceResident); + + // Verify that correct GT-Pin resource is made resident + cl_mem gtpinBuffer0 = kernelExecQueue[0].gtpinResource; + auto pBuffer0 = castToObject(gtpinBuffer0); + GraphicsAllocation *pGfxAlloc0 = pBuffer0->getGraphicsAllocation(); + cl_mem gtpinBuffer1 = kernelExecQueue[1].gtpinResource; + auto pBuffer1 = castToObject(gtpinBuffer1); + GraphicsAllocation *pGfxAlloc1 = pBuffer1->getGraphicsAllocation(); + CommandStreamReceiver &csr = pCmdQueue->getDevice().getCommandStreamReceiver(); + EXPECT_FALSE(pGfxAlloc0->isResident()); + EXPECT_FALSE(pGfxAlloc1->isResident()); + gtpinNotifyMakeResident(pKernel, &csr); + EXPECT_TRUE(pGfxAlloc0->isResident()); + EXPECT_FALSE(pGfxAlloc1->isResident()); + + // Cancel information about second submitted kernel + kernelExecQueue.pop_back(); + EXPECT_EQ(1u, kernelExecQueue.size()); + kernelResources.pop_back(); + EXPECT_EQ(1u, kernelResources.size()); + gtpinUnmapBuffer((context_handle_t)context, (resource_handle_t)gtpinBuffer1); + gtpinFreeBuffer((context_handle_t)context, (resource_handle_t)gtpinBuffer1); + retVal = clReleaseKernel(kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Verify that if flush occurs on another queue then our kernel is not flushed to CSR + uint32_t taskCount = 11; + gtpinNotifyPreFlushTask(nullptr); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_FALSE(kernelExecQueue[0].isTaskCountValid); + gtpinNotifyFlushTask(taskCount); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_FALSE(kernelExecQueue[0].isTaskCountValid); + + // Verify that if flush occurs on current queue then our kernel is flushed to CSR + gtpinNotifyPreFlushTask(pCmdQueue); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_FALSE(kernelExecQueue[0].isTaskCountValid); + gtpinNotifyFlushTask(taskCount); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_TRUE(kernelExecQueue[0].isTaskCountValid); + EXPECT_EQ(taskCount, kernelExecQueue[0].taskCount); + + // Verify that if previous task was completed then it does not affect our kernel + uint32_t taskCompleted = taskCount - 1; + int prevCount4 = CommandBufferCompleteCallbackCount; + gtpinNotifyTaskCompletion(taskCompleted); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_EQ(1u, kernelResources.size()); + EXPECT_EQ(prevCount4, CommandBufferCompleteCallbackCount); + + // Verify that if current task was completed then it is our kernel + gtpinNotifyTaskCompletion(taskCompleted + 1); + EXPECT_EQ(0u, kernelExecQueue.size()); + EXPECT_EQ(0u, kernelResources.size()); + EXPECT_EQ(prevCount4 + 1, CommandBufferCompleteCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenOneKernelIsSubmittedSeveralTimesThenCorrectBuffersAreMadeResident) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + kernelExecQueue.clear(); + + cl_kernel kernel = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Create kernel + int prevCount1 = KernelCreateCallbackCount; + kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal); + ASSERT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount1 + 1, KernelCreateCallbackCount); + + // Simulate that created kernel was sent for execution two times in a row + auto pKernel = castToObject(kernel); + auto pCmdQueue = castToObject(cmdQ); + ASSERT_NE(nullptr, pKernel); + EXPECT_EQ(0u, kernelExecQueue.size()); + EXPECT_EQ(0u, kernelResources.size()); + int prevCount2 = CommandBufferCreateCallbackCount; + int prevCount3 = KernelSubmitCallbackCount; + // First kernel submission + gtpinNotifyKernelSubmit(kernel, pCmdQueue); + EXPECT_EQ(prevCount2 + 1, CommandBufferCreateCallbackCount); + EXPECT_EQ(prevCount3 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(1u, kernelExecQueue.size()); + EXPECT_EQ(1u, kernelResources.size()); + EXPECT_EQ(pKernel, kernelExecQueue[0].pKernel); + EXPECT_EQ(kernelResources[0], (resource_handle_t)kernelExecQueue[0].gtpinResource); + EXPECT_EQ(pCmdQueue, kernelExecQueue[0].pCommandQueue); + EXPECT_FALSE(kernelExecQueue[0].isTaskCountValid); + EXPECT_FALSE(kernelExecQueue[0].isResourceResident); + // Second kernel submission + gtpinNotifyKernelSubmit(kernel, pCmdQueue); + EXPECT_EQ(prevCount2 + 2, CommandBufferCreateCallbackCount); + EXPECT_EQ(prevCount3 + 2, KernelSubmitCallbackCount); + EXPECT_EQ(2u, kernelExecQueue.size()); + EXPECT_EQ(2u, kernelResources.size()); + EXPECT_EQ(pKernel, kernelExecQueue[0].pKernel); + EXPECT_EQ(kernelResources[0], (resource_handle_t)kernelExecQueue[0].gtpinResource); + EXPECT_EQ(pCmdQueue, kernelExecQueue[0].pCommandQueue); + EXPECT_FALSE(kernelExecQueue[0].isTaskCountValid); + EXPECT_FALSE(kernelExecQueue[0].isResourceResident); + EXPECT_EQ(pKernel, kernelExecQueue[1].pKernel); + EXPECT_EQ(kernelResources[1], (resource_handle_t)kernelExecQueue[1].gtpinResource); + EXPECT_EQ(pCmdQueue, kernelExecQueue[1].pCommandQueue); + EXPECT_FALSE(kernelExecQueue[1].isTaskCountValid); + EXPECT_FALSE(kernelExecQueue[1].isResourceResident); + + // Verify that correct GT-Pin resource is made resident. + // This simulates enqueuing non-blocked kernels + cl_mem gtpinBuffer0 = kernelExecQueue[0].gtpinResource; + auto pBuffer0 = castToObject(gtpinBuffer0); + GraphicsAllocation *pGfxAlloc0 = pBuffer0->getGraphicsAllocation(); + cl_mem gtpinBuffer1 = kernelExecQueue[1].gtpinResource; + auto pBuffer1 = castToObject(gtpinBuffer1); + GraphicsAllocation *pGfxAlloc1 = pBuffer1->getGraphicsAllocation(); + CommandStreamReceiver &csr = pCmdQueue->getDevice().getCommandStreamReceiver(); + // Make resident resource of first submitted kernel + EXPECT_FALSE(pGfxAlloc0->isResident()); + EXPECT_FALSE(pGfxAlloc1->isResident()); + gtpinNotifyMakeResident(pKernel, &csr); + EXPECT_TRUE(pGfxAlloc0->isResident()); + EXPECT_FALSE(pGfxAlloc1->isResident()); + // Make resident resource of second submitted kernel + gtpinNotifyMakeResident(pKernel, &csr); + EXPECT_TRUE(pGfxAlloc0->isResident()); + EXPECT_TRUE(pGfxAlloc1->isResident()); + + // Verify that correct GT-Pin resource is added to residency list. + // This simulates enqueuing blocked kernels + kernelExecQueue[0].isResourceResident = false; + kernelExecQueue[1].isResourceResident = false; + pGfxAlloc0->residencyTaskCount = ObjectNotResident; + pGfxAlloc1->residencyTaskCount = ObjectNotResident; + EXPECT_FALSE(pGfxAlloc0->isResident()); + EXPECT_FALSE(pGfxAlloc1->isResident()); + std::vector residencyVector; + EXPECT_EQ(0u, residencyVector.size()); + // Add to residency list resource of first submitted kernel + gtpinNotifyUpdateResidencyList(pKernel, &residencyVector); + EXPECT_EQ(1u, residencyVector.size()); + // Make resident first resource on residency list + GeneralSurface *pSurf1 = (GeneralSurface *)residencyVector[0]; + pSurf1->makeResident(csr); + EXPECT_TRUE(pGfxAlloc0->isResident()); + EXPECT_FALSE(pGfxAlloc1->isResident()); + // Add to residency list resource of second submitted kernel + gtpinNotifyUpdateResidencyList(pKernel, &residencyVector); + EXPECT_EQ(2u, residencyVector.size()); + // Make resident second resource on residency list + GeneralSurface *pSurf2 = (GeneralSurface *)residencyVector[1]; + pSurf2->makeResident(csr); + EXPECT_TRUE(pGfxAlloc0->isResident()); + EXPECT_TRUE(pGfxAlloc1->isResident()); + + // Cleanup + delete pSurf1; + delete pSurf2; + residencyVector.clear(); + + kernelExecQueue.pop_back(); + EXPECT_EQ(1u, kernelExecQueue.size()); + kernelResources.pop_back(); + EXPECT_EQ(1u, kernelResources.size()); + gtpinUnmapBuffer((context_handle_t)context, (resource_handle_t)gtpinBuffer1); + gtpinFreeBuffer((context_handle_t)context, (resource_handle_t)gtpinBuffer1); + + kernelExecQueue.pop_back(); + EXPECT_EQ(0u, kernelExecQueue.size()); + kernelResources.pop_back(); + EXPECT_EQ(0u, kernelResources.size()); + gtpinUnmapBuffer((context_handle_t)context, (resource_handle_t)gtpinBuffer0); + gtpinFreeBuffer((context_handle_t)context, (resource_handle_t)gtpinBuffer0); + retVal = clReleaseKernel(kernel); EXPECT_EQ(CL_SUCCESS, retVal); @@ -642,6 +1775,298 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceThenGTPinKernelCreateCallbackIs EXPECT_EQ(CL_SUCCESS, retVal); deleteDataReadFromFile(pSource); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenLowMemoryConditionOccursThenKernelCreationFails) { + + InjectedFunction allocBufferFunc = [this](size_t failureIndex) { + cl_device_id device = (cl_device_id)pDevice; + cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + auto pContext = castToObject(context); + + // Prepare a program with one kernel having Stateless Private Surface + char binary[1024] = {1, 2, 3, 4, 5, 6, 7, 8, 9, '\0'}; + size_t binSize = 10; + Program *pProgram = Program::createFromGenBinary(pContext, &binary[0], binSize, false, &retVal); + ASSERT_NE(nullptr, pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + char *pBin = &binary[0]; + SProgramBinaryHeader *pBHdr = (SProgramBinaryHeader *)pBin; + pBHdr->Magic = iOpenCL::MAGIC_CL; + pBHdr->Version = iOpenCL::CURRENT_ICBE_VERSION; + pBHdr->Device = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + pBHdr->GPUPointerSizeInBytes = 8; + pBHdr->NumberOfKernels = 1; + pBHdr->SteppingId = 0; + pBHdr->PatchListSize = 0; + pBin += sizeof(SProgramBinaryHeader); + binSize += sizeof(SProgramBinaryHeader); + + SKernelBinaryHeaderCommon *pKHdr = (SKernelBinaryHeaderCommon *)pBin; + pKHdr->CheckSum = 0; + pKHdr->ShaderHashCode = 0; + pKHdr->KernelNameSize = 4; + pKHdr->PatchListSize = sizeof(SPatchAllocateStatelessPrivateSurface); + pKHdr->KernelHeapSize = 16; + pKHdr->GeneralStateHeapSize = 0; + pKHdr->DynamicStateHeapSize = 0; + pKHdr->SurfaceStateHeapSize = 0; + pKHdr->KernelUnpaddedSize = 0; + pBin += sizeof(SKernelBinaryHeaderCommon); + binSize += sizeof(SKernelBinaryHeaderCommon); + char *pKernelBin = pBin; + + strcpy(pBin, "Tst"); + pBin += pKHdr->KernelNameSize; + binSize += pKHdr->KernelNameSize; + + strcpy(pBin, "fake_ISA_code__"); + pBin += pKHdr->KernelHeapSize; + binSize += pKHdr->KernelHeapSize; + + SPatchAllocateStatelessPrivateSurface *pPatch = (SPatchAllocateStatelessPrivateSurface *)pBin; + pPatch->Token = iOpenCL::PATCH_TOKEN_ALLOCATE_STATELESS_PRIVATE_MEMORY; + pPatch->Size = sizeof(iOpenCL::SPatchAllocateStatelessPrivateSurface); + pPatch->SurfaceStateHeapOffset = 0; + pPatch->DataParamOffset = 0; + pPatch->DataParamSize = 0; + pPatch->PerThreadPrivateMemorySize = 4; + binSize += sizeof(SPatchAllocateStatelessPrivateSurface); + + uint32_t kernelBinSize = + pKHdr->DynamicStateHeapSize + + pKHdr->GeneralStateHeapSize + + pKHdr->KernelHeapSize + + pKHdr->KernelNameSize + + pKHdr->PatchListSize + + pKHdr->SurfaceStateHeapSize; + uint64_t hashValue = Hash::hash(reinterpret_cast(pKernelBin), kernelBinSize); + pKHdr->CheckSum = static_cast(hashValue & 0xFFFFFFFF); + + pProgram->storeGenBinary(&binary[0], binSize); + retVal = pProgram->processGenBinary(); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Create kernels from program + cl_kernel kernels[2] = {0}; + cl_uint numCreatedKernels = 0; + retVal = clCreateKernelsInProgram(pProgram, 0, &kernels[0], &numCreatedKernels); + + if (nonfailingAllocation != failureIndex) { + EXPECT_EQ(nullptr, kernels[0]); + EXPECT_EQ(1u, numCreatedKernels); + } else { + EXPECT_NE(nullptr, kernels[0]); + EXPECT_EQ(1u, numCreatedKernels); + clReleaseKernel(kernels[0]); + } + + clReleaseProgram(pProgram); + clReleaseContext(context); + }; + + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + ASSERT_EQ(&OCLRT::gtpinCreateBuffer, driverServices.bufferAllocate); + ASSERT_EQ(&OCLRT::gtpinFreeBuffer, driverServices.bufferDeallocate); + EXPECT_EQ(&OCLRT::gtpinMapBuffer, driverServices.bufferMap); + EXPECT_EQ(&OCLRT::gtpinUnmapBuffer, driverServices.bufferUnMap); + + injectFailures(allocBufferFunc); +} + +TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) { + cl_kernel kernel = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Create kernel + kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal); + ASSERT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + Kernel *pKernel = castToObject(kernel); + ASSERT_NE(nullptr, pKernel); + + size_t numBTS1 = pKernel->getNumberOfBindingTableStates(); + EXPECT_EQ(2u, numBTS1); + size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(); + EXPECT_NE(0u, sizeSurfaceStates1); + size_t offsetBTS1 = pKernel->getBindingTableOffset(); + EXPECT_NE(0u, offsetBTS1); + + GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + GTPinHwHelper >pinHelper = GTPinHwHelper::get(genFamily); + void *pSS1 = gtpinHelper.getSurfaceState(pKernel, 0); + EXPECT_NE(nullptr, pSS1); + + // Enlarge SSH by one SURFACE STATE element + bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel); + EXPECT_TRUE(surfaceAdded); + + size_t numBTS2 = pKernel->getNumberOfBindingTableStates(); + EXPECT_EQ(numBTS1 + 1, numBTS2); + size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(); + EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1); + size_t offsetBTS2 = pKernel->getBindingTableOffset(); + EXPECT_GT(offsetBTS2, offsetBTS1); + + void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0); + EXPECT_NE(pSS2, pSS1); + + pSS2 = gtpinHelper.getSurfaceState(pKernel, numBTS2); + EXPECT_EQ(nullptr, pSS2); + + // Remove kernel's SSH + pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0); + + // Try to enlarge SSH once again, this time the operation must fail + surfaceAdded = gtpinHelper.addSurfaceState(pKernel); + EXPECT_FALSE(surfaceAdded); + + size_t numBTS3 = pKernel->getNumberOfBindingTableStates(); + EXPECT_EQ(0u, numBTS3); + size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(); + EXPECT_EQ(0u, sizeSurfaceStates3); + size_t offsetBTS3 = pKernel->getBindingTableOffset(); + EXPECT_EQ(0u, offsetBTS3); + void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0); + EXPECT_EQ(nullptr, pSS3); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, givenKernelThenVerifyThatKernelCodeSubstitutionWorksWell) { + cl_kernel kernel = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + void *pSource = nullptr; + size_t sourceSize = 0; + std::string testFile; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + sourceSize = loadDataFromFile(testFile.c_str(), pSource); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + pProgram = clCreateProgramWithSource( + context, + 1, + (const char **)&pSource, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Create kernel + kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal); + ASSERT_NE(nullptr, kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + Kernel *pKernel = castToObject(kernel); + ASSERT_NE(nullptr, pKernel); + + bool isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(); + EXPECT_FALSE(isKernelCodeSubstituted); + + // Substitute new kernel code + constexpr size_t newCodeSize = 64; + uint8_t newCode[newCodeSize]; + pKernel->substituteKernelHeap(&newCode[0], newCodeSize); + + // Verify that substitution went properly + isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(); + EXPECT_TRUE(isKernelCodeSubstituted); + uint8_t *pBin2 = reinterpret_cast(const_cast(pKernel->getKernelHeap())); + EXPECT_EQ(pBin2, &newCode[0]); + + // Cleanup + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + deleteDataReadFromFile(pSource); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(GTPinTests, checkWhetherGTPinHwHelperGetterWorksWell) { + GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily; + GTPinHwHelper *pGTPinHelper = >PinHwHelper::get(genFamily); + EXPECT_NE(nullptr, pGTPinHelper); } } // namespace ULT diff --git a/unit_tests/helpers/base_object_tests.cpp b/unit_tests/helpers/base_object_tests.cpp index be09102abd..7589eb4a67 100644 --- a/unit_tests/helpers/base_object_tests.cpp +++ b/unit_tests/helpers/base_object_tests.cpp @@ -324,6 +324,20 @@ TEST(BaseObjectTest, takeOwnership) { EXPECT_FALSE(buffer.hasOwnership()); } +TEST(BaseObjectTest, takeOwnershipWrapper) { + MockBuffer buffer; + { + TakeOwnershipWrapper bufferOwnership(buffer, false); + EXPECT_FALSE(buffer.hasOwnership()); + } + { + TakeOwnershipWrapper bufferOwnership(buffer, true); + EXPECT_TRUE(buffer.hasOwnership()); + bufferOwnership.unlock(); + EXPECT_FALSE(buffer.hasOwnership()); + } +} + TYPED_TEST(BaseObjectTests, getCond) { TypeParam *object = new TypeParam; diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp index 6b10d3d67e..35e8d4ea26 100644 --- a/unit_tests/helpers/kernel_commands_tests.cpp +++ b/unit_tests/helpers/kernel_commands_tests.cpp @@ -486,7 +486,7 @@ HWTEST_F(KernelCommandsTest, usedBindingTableStatePointersForGlobalAndConstantAn auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); // Initialize binding table state pointers with pattern - EXPECT_EQ(numSurfaces, pKernel->getNumberOfSurfaceStates()); + EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates()); const size_t localWorkSizes[3]{256, 1, 1}; @@ -566,7 +566,7 @@ HWTEST_F(KernelCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequiri auto usedBefore = ssh.getUsed(); // Initialize binding table state pointers with pattern - auto numSurfaceStates = pKernel->getNumberOfSurfaceStates(); + auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(); EXPECT_EQ(0u, numSurfaceStates); // set binding table states @@ -613,10 +613,13 @@ HWTEST_F(KernelCommandsTest, setBindingTableStatesForNoSurfaces) { auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); // Initialize binding table state pointers with pattern - auto numSurfaceStates = pKernel->getNumberOfSurfaceStates(); + auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(); EXPECT_EQ(0u, numSurfaceStates); - auto dstBindingTablePointer = KernelCommandsHelper::pushBindingTableAndSurfaceStates(ssh, *pKernel); + auto dstBindingTablePointer = KernelCommandsHelper::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo); + EXPECT_EQ(0u, dstBindingTablePointer); + + dstBindingTablePointer = KernelCommandsHelper::pushBindingTableAndSurfaceStates(ssh, *pKernel); EXPECT_EQ(0u, dstBindingTablePointer); SPatchBindingTableState bindingTableState; diff --git a/unit_tests/utilities/CMakeLists.txt b/unit_tests/utilities/CMakeLists.txt index ca8a414dbe..21e98ea576 100644 --- a/unit_tests/utilities/CMakeLists.txt +++ b/unit_tests/utilities/CMakeLists.txt @@ -31,6 +31,7 @@ set(IGDRCL_SRCS_tests_utilities "${CMAKE_CURRENT_SOURCE_DIR}/timer_util_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/cpuinfo_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/perf_profiler.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/spinlock_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp" PARENT_SCOPE ) diff --git a/unit_tests/utilities/spinlock_tests.cpp b/unit_tests/utilities/spinlock_tests.cpp new file mode 100644 index 0000000000..a3f8cb041a --- /dev/null +++ b/unit_tests/utilities/spinlock_tests.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "runtime/utilities/spinlock.h" +#include "gtest/gtest.h" +#include + +using namespace OCLRT; + +TEST(SpinLockTest, givenTwoThreadsThenVerifyThatTheySynchronizeWithSpinLock) { + std::atomic_flag syncLock = ATOMIC_FLAG_INIT; + std::atomic threadStarted(false); + std::atomic threadFinished(false); + SpinLock lock1; + int sharedCount = 0; + + // Initially acquire spin lock so the worker thread will wait + lock1.enter(syncLock); + + // Start worker thread + std::thread t([&]() { + threadStarted = true; + SpinLock lock2; + lock2.enter(syncLock); + sharedCount++; + EXPECT_EQ(2, sharedCount); + lock2.leave(syncLock); + threadFinished = true; + }); + + // Wait till worker thread is started + while (!threadStarted) { + }; + sharedCount++; + EXPECT_EQ(1, sharedCount); + + // Release spin lock thus allowing worker thread to proceed + lock1.leave(syncLock); + + // Wait till worker thread finishes + while (!threadFinished) { + }; + EXPECT_EQ(2, sharedCount); + t.join(); +}