From 82bc594af08dfbdbc35ac00ad7cebcb6a6b2b7a3 Mon Sep 17 00:00:00 2001 From: Filip Hazubski Date: Tue, 19 Nov 2019 16:54:47 +0100 Subject: [PATCH] Add clEnqueueNDRangeKernelINTEL API Related-To: NEO-2712 Change-Id: If1d16d9d626871a9dc4b19282f9edc5786ffa398 Signed-off-by: Filip Hazubski --- core/program/CMakeLists.txt | 2 + core/program/sync_buffer_handler.cpp | 54 ++++++ core/program/sync_buffer_handler.h | 42 +++++ runtime/api/api.cpp | 78 +++++++- runtime/api/api.h | 10 ++ runtime/command_queue/enqueue_common.h | 10 ++ .../patchtokens_decoder.cpp | 4 + .../compiler_interface/patchtokens_decoder.h | 1 + runtime/device/device.cpp | 11 ++ runtime/device/device.h | 8 +- runtime/kernel/kernel.cpp | 19 ++ runtime/kernel/kernel.h | 2 + runtime/program/kernel_info.cpp | 5 + runtime/program/kernel_info.h | 1 + .../program/kernel_info_from_patchtokens.cpp | 1 + runtime/program/patch_info.h | 2 + runtime/program/process_gen_binary.cpp | 1 + ...on_function_address_for_platform_tests.inl | 10 ++ ...l_get_extension_function_address_tests.inl | 10 ++ unit_tests/command_queue/CMakeLists.txt | 5 +- .../command_queue/enqueue_kernel_1_tests.cpp | 151 ++++++++++++++++ .../sync_buffer_handler_tests.cpp | 167 ++++++++++++++++++ unit_tests/fixtures/kernel_data_fixture.cpp | 1 - unit_tests/fixtures/kernel_data_fixture.h | 15 +- unit_tests/gtpin/gtpin_tests.cpp | 149 ++++++++++++++++ unit_tests/program/kernel_data.cpp | 19 ++ 26 files changed, 764 insertions(+), 14 deletions(-) create mode 100644 core/program/sync_buffer_handler.cpp create mode 100644 core/program/sync_buffer_handler.h create mode 100644 unit_tests/command_queue/sync_buffer_handler_tests.cpp diff --git a/core/program/CMakeLists.txt b/core/program/CMakeLists.txt index 2a102955b0..ff9da333de 100644 --- a/core/program/CMakeLists.txt +++ b/core/program/CMakeLists.txt @@ -8,6 +8,8 @@ set(NEO_CORE_PROGRAM ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/print_formatter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/print_formatter.h + ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h ) set_property(GLOBAL PROPERTY NEO_CORE_PROGRAM ${NEO_CORE_PROGRAM}) diff --git a/core/program/sync_buffer_handler.cpp b/core/program/sync_buffer_handler.cpp new file mode 100644 index 0000000000..433068ad2c --- /dev/null +++ b/core/program/sync_buffer_handler.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "core/program/sync_buffer_handler.h" + +#include "core/memory_manager/graphics_allocation.h" +#include "runtime/command_stream/command_stream_receiver.h" +#include "runtime/kernel/kernel.h" +#include "runtime/memory_manager/memory_manager.h" + +namespace NEO { + +SyncBufferHandler::~SyncBufferHandler() { + memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); +}; +SyncBufferHandler::SyncBufferHandler(Device &device) + : device(device), memoryManager(*device.getMemoryManager()) { + + allocateNewBuffer(); +} + +void SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, Kernel &kernel, CommandStreamReceiver &csr) { + auto requiredSize = workGroupsCount; + std::lock_guard guard(this->mutex); + + bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize); + if (isCurrentBufferFull) { + memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); + allocateNewBuffer(); + usedBufferSize = 0; + } + + kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize); + csr.makeResident(*graphicsAllocation); + + usedBufferSize += requiredSize; +} + +void SyncBufferHandler::allocateNewBuffer() { + AllocationProperties allocationProperties{device.getRootDeviceIndex(), true, bufferSize, + GraphicsAllocation::AllocationType::LINEAR_STREAM, + false, false, static_cast(device.getDeviceBitfield().to_ulong())}; + graphicsAllocation = memoryManager.allocateGraphicsMemoryWithProperties(allocationProperties); + UNRECOVERABLE_IF(graphicsAllocation == nullptr); + + auto cpuPointer = graphicsAllocation->getUnderlyingBuffer(); + std::memset(cpuPointer, 0, bufferSize); +} + +} // namespace NEO diff --git a/core/program/sync_buffer_handler.h b/core/program/sync_buffer_handler.h new file mode 100644 index 0000000000..244c75ec13 --- /dev/null +++ b/core/program/sync_buffer_handler.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "core/helpers/basic_math.h" + +#include + +namespace NEO { + +class CommandStreamReceiver; +class Context; +class Device; +class GraphicsAllocation; +class MemoryManager; +class Kernel; + +class SyncBufferHandler { + public: + ~SyncBufferHandler(); + + SyncBufferHandler(Device &device); + + void prepareForEnqueue(size_t workGroupsCount, Kernel &kernel, CommandStreamReceiver &csr); + + protected: + void allocateNewBuffer(); + + Device &device; + MemoryManager &memoryManager; + GraphicsAllocation *graphicsAllocation; + const size_t bufferSize = 64 * KB; + size_t usedBufferSize = 0; + std::mutex mutex; +}; + +} // namespace NEO diff --git a/runtime/api/api.cpp b/runtime/api/api.cpp index ee0b67bb8f..7b4c6ce60b 100644 --- a/runtime/api/api.cpp +++ b/runtime/api/api.cpp @@ -3135,10 +3135,11 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue, "event", DebugManager.getEvents(reinterpret_cast(event), 1)); CommandQueue *pCommandQueue = nullptr; + Kernel *pKernel = nullptr; retVal = validateObjects( WithCastToInternal(commandQueue, &pCommandQueue), - kernel, + WithCastToInternal(kernel, &pKernel), EventWaitList(numEventsInWaitList, eventWaitList)); if (CL_SUCCESS != retVal) { @@ -3146,7 +3147,12 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue, return retVal; } - auto pKernel = castToObjectOrAbort(kernel); + if (pKernel->getKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr) { + retVal = CL_INVALID_KERNEL; + TRACING_EXIT(clEnqueueNDRangeKernel, &retVal); + return retVal; + } + TakeOwnershipWrapper kernelOwnership(*pKernel, gtpinIsGTPinInitialized()); if (gtpinIsGTPinInitialized()) { gtpinNotifyKernelSubmit(kernel, pCommandQueue); @@ -3947,6 +3953,7 @@ void *CL_API_CALL clGetExtensionFunctionAddress(const char *funcName) { RETURN_FUNC_PTR_IF_EXIST(clGetDeviceFunctionPointerINTEL); RETURN_FUNC_PTR_IF_EXIST(clGetDeviceGlobalVariablePointerINTEL); RETURN_FUNC_PTR_IF_EXIST(clGetExecutionInfoINTEL); + RETURN_FUNC_PTR_IF_EXIST(clEnqueueNDRangeKernelINTEL); void *ret = sharingFactory.getExtensionFunctionAddress(funcName); if (ret != nullptr) { @@ -5197,3 +5204,70 @@ cl_int CL_API_CALL clGetExecutionInfoINTEL(cl_command_queue commandQueue, return retVal; } + +cl_int CL_API_CALL clEnqueueNDRangeKernelINTEL(cl_command_queue commandQueue, + cl_kernel kernel, + cl_uint workDim, + const size_t *globalWorkOffset, + const size_t *workgroupCount, + const size_t *localWorkSize, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *event) { + cl_int retVal = CL_SUCCESS; + API_ENTER(&retVal); + DBG_LOG_INPUTS("commandQueue", commandQueue, "cl_kernel", kernel, + "globalWorkOffset[0]", DebugManager.getInput(globalWorkOffset, 0), + "globalWorkOffset[1]", DebugManager.getInput(globalWorkOffset, 1), + "globalWorkOffset[2]", DebugManager.getInput(globalWorkOffset, 2), + "workgroupCount", DebugManager.getSizes(workgroupCount, workDim, false), + "localWorkSize", DebugManager.getSizes(localWorkSize, workDim, true), + "numEventsInWaitList", numEventsInWaitList, + "eventWaitList", DebugManager.getEvents(reinterpret_cast(eventWaitList), numEventsInWaitList), + "event", DebugManager.getEvents(reinterpret_cast(event), 1)); + + CommandQueue *pCommandQueue = nullptr; + Kernel *pKernel = nullptr; + + retVal = validateObjects( + WithCastToInternal(commandQueue, &pCommandQueue), + WithCastToInternal(kernel, &pKernel), + EventWaitList(numEventsInWaitList, eventWaitList)); + + if (CL_SUCCESS != retVal) { + return retVal; + } + + size_t globalWorkSize[3]; + size_t requestedNumberOfWorkgroups = 1; + for (size_t i = 0; i < workDim; i++) { + globalWorkSize[i] = workgroupCount[i] * localWorkSize[i]; + requestedNumberOfWorkgroups *= workgroupCount[i]; + } + + size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize); + if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) { + retVal = CL_INVALID_VALUE; + return retVal; + } + + TakeOwnershipWrapper kernelOwnership(*pKernel, gtpinIsGTPinInitialized()); + if (gtpinIsGTPinInitialized()) { + gtpinNotifyKernelSubmit(kernel, pCommandQueue); + } + + pCommandQueue->getDevice().allocateSyncBufferHandler(); + + retVal = pCommandQueue->enqueueKernel( + kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + + DBG_LOG_INPUTS("event", DebugManager.getEvents(reinterpret_cast(event), 1u)); + return retVal; +} diff --git a/runtime/api/api.h b/runtime/api/api.h index 7d89a6ace3..f72a6daec5 100644 --- a/runtime/api/api.h +++ b/runtime/api/api.h @@ -1036,6 +1036,16 @@ cl_int CL_API_CALL clGetExecutionInfoINTEL( void *paramValue, size_t *paramValueSizeRet); +cl_int CL_API_CALL clEnqueueNDRangeKernelINTEL(cl_command_queue commandQueue, + cl_kernel kernel, + cl_uint workDim, + const size_t *globalWorkOffset, + const size_t *workgroupCount, + const size_t *localWorkSize, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *event); + // OpenCL 2.2 cl_int CL_API_CALL clSetProgramSpecializationConstant( diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index c2da74d899..847766d956 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -6,6 +6,7 @@ */ #pragma once +#include "core/program/sync_buffer_handler.h" #include "core/utilities/range.h" #include "runtime/built_ins/built_ins.h" #include "runtime/built_ins/builtins_dispatch_builder.h" @@ -642,6 +643,15 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( blocking = true; printfHandler->makeResident(getGpgpuCommandStreamReceiver()); } + + if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) { + auto &gws = multiDispatchInfo.begin()->getGWS(); + auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize(); + size_t workGroupsCount = (gws.x * gws.y * gws.z) / + (lws.x * lws.y * lws.z); + device->syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel(), getGpgpuCommandStreamReceiver()); + } + if (timestampPacketContainer) { timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver()); timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver()); diff --git a/runtime/compiler_interface/patchtokens_decoder.cpp b/runtime/compiler_interface/patchtokens_decoder.cpp index 85486883c7..dc6b666fa0 100644 --- a/runtime/compiler_interface/patchtokens_decoder.cpp +++ b/runtime/compiler_interface/patchtokens_decoder.cpp @@ -407,6 +407,10 @@ inline bool decodeToken(const SPatchItemHeader *token, KernelFromPatchtokens &ou auto tokDataP = reinterpret_cast(token); decodeKernelDataParameterToken(tokDataP, out); } break; + + case PATCH_TOKEN_ALLOCATE_SYNC_BUFFER: { + assignToken(out.tokens.allocateSyncBuffer, token); + } break; } return out.decodeStatus != DecoderError::InvalidBinary; diff --git a/runtime/compiler_interface/patchtokens_decoder.h b/runtime/compiler_interface/patchtokens_decoder.h index 5caf5971d0..501b8c4e95 100644 --- a/runtime/compiler_interface/patchtokens_decoder.h +++ b/runtime/compiler_interface/patchtokens_decoder.h @@ -129,6 +129,7 @@ struct KernelFromPatchtokens { const SPatchAllocateStatelessPrintfSurface *allocateStatelessPrintfSurface = nullptr; const SPatchAllocateStatelessEventPoolSurface *allocateStatelessEventPoolSurface = nullptr; const SPatchAllocateStatelessDefaultDeviceQueueSurface *allocateStatelessDefaultDeviceQueueSurface = nullptr; + const SPatchAllocateSyncBuffer *allocateSyncBuffer = nullptr; const SPatchItemHeader *inlineVmeSamplerInfo = nullptr; const SPatchGtpinFreeGRFInfo *gtpinFreeGrfInfo = nullptr; const SPatchStateSIP *stateSip = nullptr; diff --git a/runtime/device/device.cpp b/runtime/device/device.cpp index 9de794bfd3..1e84be3cfa 100644 --- a/runtime/device/device.cpp +++ b/runtime/device/device.cpp @@ -9,6 +9,7 @@ #include "core/command_stream/preemption.h" #include "core/helpers/hw_helper.h" +#include "core/program/sync_buffer_handler.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/command_stream/experimental_command_buffer.h" #include "runtime/device/device_vector.h" @@ -60,6 +61,7 @@ Device::Device(ExecutionEnvironment *executionEnvironment) Device::~Device() { DEBUG_BREAK_IF(nullptr == executionEnvironment->memoryManager.get()); + syncBufferHandler.reset(); if (performanceCounters) { performanceCounters->shutdown(); } @@ -203,6 +205,15 @@ double Device::getPlatformHostTimerResolution() const { return osTime->getHostTimerResolution(); return 0.0; } + +void Device::allocateSyncBufferHandler() { + TakeOwnershipWrapper lock(*this); + if (syncBufferHandler.get() == nullptr) { + syncBufferHandler = std::make_unique(*this); + UNRECOVERABLE_IF(syncBufferHandler.get() == nullptr); + } +} + GFXCORE_FAMILY Device::getRenderCoreFamily() const { return this->getHardwareInfo().platform.eRenderCoreFamily; } diff --git a/runtime/device/device.h b/runtime/device/device.h index 505c5bcfa4..b5eb297dee 100644 --- a/runtime/device/device.h +++ b/runtime/device/device.h @@ -18,8 +18,9 @@ #include "engine_node.h" namespace NEO { -class OSTime; class DriverInfo; +class OSTime; +class SyncBufferHandler; template <> struct OpenCLObjectMapper<_cl_device_id> { @@ -72,6 +73,7 @@ class Device : public BaseObject<_cl_device_id> { double getPlatformHostTimerResolution() const; bool isSimulation() const; GFXCORE_FAMILY getRenderCoreFamily() const; + void allocateSyncBufferHandler(); PerformanceCounters *getPerformanceCounters() { return performanceCounters.get(); } PreemptionMode getPreemptionMode() const { return preemptionMode; } MOCKABLE_VIRTUAL bool isSourceLevelDebuggerActive() const; @@ -88,8 +90,10 @@ class Device : public BaseObject<_cl_device_id> { virtual uint32_t getRootDeviceIndex() const = 0; virtual uint32_t getNumAvailableDevices() const = 0; virtual Device *getDeviceById(uint32_t deviceId) const = 0; + virtual DeviceBitfield getDeviceBitfield() const = 0; static decltype(&PerformanceCounters::create) createPerformanceCountersFunc; + std::unique_ptr syncBufferHandler; protected: Device() = delete; @@ -113,8 +117,6 @@ class Device : public BaseObject<_cl_device_id> { bool createEngine(uint32_t deviceCsrIndex, aub_stream::EngineType engineType); MOCKABLE_VIRTUAL std::unique_ptr createCommandStreamReceiver() const; - virtual DeviceBitfield getDeviceBitfield() const = 0; - std::vector simultaneousInterops; unsigned int enabledClVersion = 0u; std::string deviceExtensions; diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp index 8fad647bb5..df01b6c2a2 100644 --- a/runtime/kernel/kernel.cpp +++ b/runtime/kernel/kernel.cpp @@ -2205,6 +2205,25 @@ void Kernel::patchBlocksSimdSize() { } } +bool Kernel::usesSyncBuffer() { + return (kernelInfo.patchInfo.pAllocateSyncBuffer != nullptr); +} + +void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) { + auto &patchInfo = kernelInfo.patchInfo; + auto bufferPatchAddress = ptrOffset(getCrossThreadData(), patchInfo.pAllocateSyncBuffer->DataParamOffset); + patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize, + ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)); + + if (requiresSshForBuffers()) { + auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap()), + patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset); + auto addressToPatch = gfxAllocation->getUnderlyingBuffer(); + auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize(); + Buffer::setSurfaceState(&device, surfaceState, sizeToPatch, addressToPatch, gfxAllocation); + } +} + template void Kernel::patchReflectionSurface(DeviceQueue *, PrintfHandler *); bool Kernel::isPatched() const { diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index fde592acce..e65ed76f66 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -218,6 +218,8 @@ class Kernel : public BaseObject<_cl_kernel> { void patchDefaultDeviceQueue(DeviceQueue *devQueue); void patchEventPool(DeviceQueue *devQueue); void patchBlocksSimdSize(); + bool usesSyncBuffer(); + void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset); GraphicsAllocation *getKernelReflectionSurface() const { return kernelReflectionSurface; diff --git a/runtime/program/kernel_info.cpp b/runtime/program/kernel_info.cpp index ac6c8fafe6..a34328bc75 100644 --- a/runtime/program/kernel_info.cpp +++ b/runtime/program/kernel_info.cpp @@ -411,6 +411,11 @@ void KernelInfo::storePatchToken(const SPatchAllocateSystemThreadSurface *pSyste patchInfo.pAllocateSystemThreadSurface = pSystemThreadSurface; } +void KernelInfo::storePatchToken(const SPatchAllocateSyncBuffer *pAllocateSyncBuffer) { + usesSsh |= true; + patchInfo.pAllocateSyncBuffer = pAllocateSyncBuffer; +} + cl_int KernelInfo::resolveKernelInfo() { cl_int retVal = CL_SUCCESS; std::unordered_map::iterator iterUint; diff --git a/runtime/program/kernel_info.h b/runtime/program/kernel_info.h index cd11bdfb5a..2d0ecb140e 100644 --- a/runtime/program/kernel_info.h +++ b/runtime/program/kernel_info.h @@ -115,6 +115,7 @@ struct KernelInfo { void storePatchToken(const SPatchString *pStringArg); void storePatchToken(const SPatchKernelAttributesInfo *pKernelAttributesInfo); void storePatchToken(const SPatchAllocateSystemThreadSurface *pSystemThreadSurface); + void storePatchToken(const SPatchAllocateSyncBuffer *pAllocateSyncBuffer); GraphicsAllocation *getGraphicsAllocation() const { return this->kernelAllocation; } cl_int resolveKernelInfo(); void resizeKernelArgInfoAndRegisterParameter(uint32_t argCount) { diff --git a/runtime/program/kernel_info_from_patchtokens.cpp b/runtime/program/kernel_info_from_patchtokens.cpp index 8cd1ae99d2..a892190270 100644 --- a/runtime/program/kernel_info_from_patchtokens.cpp +++ b/runtime/program/kernel_info_from_patchtokens.cpp @@ -145,6 +145,7 @@ void populateKernelInfo(KernelInfo &dst, const PatchTokenBinary::KernelFromPatch storeTokenIfNotNull(dst, src.tokens.allocateStatelessPrintfSurface); storeTokenIfNotNull(dst, src.tokens.allocateStatelessEventPoolSurface); storeTokenIfNotNull(dst, src.tokens.allocateStatelessDefaultDeviceQueueSurface); + storeTokenIfNotNull(dst, src.tokens.allocateSyncBuffer); for (auto &str : src.tokens.strings) { dst.storePatchToken(str); diff --git a/runtime/program/patch_info.h b/runtime/program/patch_info.h index 9929c7c9dc..f4596ec361 100644 --- a/runtime/program/patch_info.h +++ b/runtime/program/patch_info.h @@ -23,6 +23,7 @@ using iOpenCL::SPatchAllocateStatelessEventPoolSurface; using iOpenCL::SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization; using iOpenCL::SPatchAllocateStatelessPrintfSurface; using iOpenCL::SPatchAllocateStatelessPrivateSurface; +using iOpenCL::SPatchAllocateSyncBuffer; using iOpenCL::SPatchAllocateSystemThreadSurface; using iOpenCL::SPatchBindingTableState; using iOpenCL::SPatchDataParameterBuffer; @@ -66,6 +67,7 @@ struct PatchInfo { const SPatchExecutionEnvironment *executionEnvironment = nullptr; const SPatchKernelAttributesInfo *pKernelAttributesInfo = nullptr; const SPatchAllocateStatelessPrivateSurface *pAllocateStatelessPrivateSurface = nullptr; + const SPatchAllocateSyncBuffer *pAllocateSyncBuffer = nullptr; const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pAllocateStatelessConstantMemorySurfaceWithInitialization = nullptr; const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pAllocateStatelessGlobalMemorySurfaceWithInitialization = nullptr; const SPatchAllocateStatelessPrintfSurface *pAllocateStatelessPrintfSurface = nullptr; diff --git a/runtime/program/process_gen_binary.cpp b/runtime/program/process_gen_binary.cpp index 511796dcb5..66d098fb24 100644 --- a/runtime/program/process_gen_binary.cpp +++ b/runtime/program/process_gen_binary.cpp @@ -79,6 +79,7 @@ void Program::populateKernelInfo( if (retVal != CL_SUCCESS) { return; } + kernelInfo->gpuPointerSize = decodedProgram.header->GPUPointerSizeInBytes; if (decodedKernel.tokens.programSymbolTable) { diff --git a/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl b/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl index 37c9a03af2..5ba0f228ab 100644 --- a/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl +++ b/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl @@ -77,4 +77,14 @@ TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetTracingStateINTE auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clGetTracingStateINTEL"); EXPECT_EQ(retVal, reinterpret_cast(clGetTracingStateINTEL)); } + +TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetExecutionInfoINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clGetExecutionInfoINTEL"); + EXPECT_EQ(retVal, reinterpret_cast(clGetExecutionInfoINTEL)); +} + +TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClEnqueueNDRangeKernelINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clEnqueueNDRangeKernelINTEL"); + EXPECT_EQ(retVal, reinterpret_cast(clEnqueueNDRangeKernelINTEL)); +} } // namespace ULT diff --git a/unit_tests/api/cl_get_extension_function_address_tests.inl b/unit_tests/api/cl_get_extension_function_address_tests.inl index 9376245978..3326af0139 100644 --- a/unit_tests/api/cl_get_extension_function_address_tests.inl +++ b/unit_tests/api/cl_get_extension_function_address_tests.inl @@ -162,4 +162,14 @@ TEST_F(clGetExtensionFunctionAddressTests, GivenClGetDeviceFunctionPointerINTELW auto retVal = clGetExtensionFunctionAddress("clGetDeviceFunctionPointerINTEL"); EXPECT_EQ(retVal, reinterpret_cast(clGetDeviceFunctionPointerINTEL)); } + +TEST_F(clGetExtensionFunctionAddressTests, GivenClGetExecutionInfoINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddress("clGetExecutionInfoINTEL"); + EXPECT_EQ(retVal, reinterpret_cast(clGetExecutionInfoINTEL)); +} + +TEST_F(clGetExtensionFunctionAddressTests, GivenClEnqueueNDRangeKernelINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddress("clEnqueueNDRangeKernelINTEL"); + EXPECT_EQ(retVal, reinterpret_cast(clEnqueueNDRangeKernelINTEL)); +} } // namespace ULT diff --git a/unit_tests/command_queue/CMakeLists.txt b/unit_tests/command_queue/CMakeLists.txt index d58edd6b4d..5f8917bfcd 100644 --- a/unit_tests/command_queue/CMakeLists.txt +++ b/unit_tests/command_queue/CMakeLists.txt @@ -34,11 +34,11 @@ set(IGDRCL_SRCS_tests_command_queue ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_fixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_handler_tests.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_1_tests.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_2_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_event_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_global_offset_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_local_work_size_tests.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_1_tests.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_2_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_two_ioq_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_two_ooq_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_kernel_two_walker_ioq_tests.cpp @@ -89,6 +89,7 @@ set(IGDRCL_SRCS_tests_command_queue ${CMAKE_CURRENT_SOURCE_DIR}/ooq_task_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ooq_task_tests_mt.cpp ${CMAKE_CURRENT_SOURCE_DIR}/read_write_buffer_cpu_copy.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/work_group_size_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/zero_size_enqueue_tests.cpp ) diff --git a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp index 6e74b75dd2..6276b72066 100644 --- a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp @@ -6,6 +6,7 @@ */ #include "core/unit_tests/helpers/debug_manager_state_restore.h" +#include "runtime/api/api.h" #include "runtime/built_ins/builtins_dispatch_builder.h" #include "unit_tests/command_queue/enqueue_fixture.h" #include "unit_tests/fixtures/hello_world_fixture.h" @@ -187,6 +188,156 @@ TEST_F(EnqueueKernelTest, GivenInvalidWorkGroupSizeWhenEnqueuingKernelThenInvali EXPECT_EQ(CL_INVALID_WORK_GROUP_SIZE, retVal); } +TEST_F(EnqueueKernelTest, GivenNullKernelWhenEnqueuingKernelINTELThenInvalidKernelErrorIsReturned) { + size_t workgroupCount[3] = {1, 1, 1}; + auto retVal = clEnqueueNDRangeKernelINTEL( + pCmdQ, + nullptr, + 1, + nullptr, + workgroupCount, + nullptr, + 0, + nullptr, + nullptr); + + EXPECT_EQ(CL_INVALID_KERNEL, retVal); +} + +TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDRangeKernelINTELReturnsSuccess) { + const size_t n = 512; + size_t workgroupCount[3] = {2, 1, 1}; + size_t localWorkSize[3] = {256, 1, 1}; + cl_int retVal = CL_SUCCESS; + CommandQueue *pCmdQ2 = createCommandQueue(pDevice); + + std::unique_ptr kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto b0 = clCreateBuffer(context, 0, n * sizeof(float), nullptr, nullptr); + auto b1 = clCreateBuffer(context, 0, n * sizeof(float), nullptr, nullptr); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clSetKernelArg(kernel.get(), 0, sizeof(cl_mem), &b0); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clSetKernelArg(kernel.get(), 1, sizeof(cl_mem), &b1); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_TRUE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseMemObject(b0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(b1); + EXPECT_EQ(CL_SUCCESS, retVal); + clReleaseCommandQueue(pCmdQ2); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalledTwiceThenClEnqueueNDRangeKernelINTELReturnsError) { + const size_t n = 512; + size_t workgroupCount[3] = {2, 1, 1}; + size_t localWorkSize[3] = {256, 1, 1}; + cl_int retVal = CL_SUCCESS; + CommandQueue *pCmdQ2 = createCommandQueue(pDevice); + + std::unique_ptr kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto b0 = clCreateBuffer(context, 0, n * sizeof(float), nullptr, nullptr); + auto b1 = clCreateBuffer(context, 0, n * sizeof(float), nullptr, nullptr); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clSetKernelArg(kernel.get(), 0, sizeof(cl_mem), &b0); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clSetKernelArg(kernel.get(), 0, sizeof(cl_mem), &b1); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clReleaseMemObject(b0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(b1); + EXPECT_EQ(CL_SUCCESS, retVal); + clReleaseCommandQueue(pCmdQ2); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeastFailsThenClEnqueueNDRangeKernelINTELReturnsError) { + const size_t n = 512; + size_t workgroupCount[3] = {2, 1, 1}; + size_t localWorkSize[3] = {256, 1, 1}; + cl_int retVal = CL_SUCCESS; + CommandQueue *pCmdQ2 = createCommandQueue(pDevice); + + std::unique_ptr kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto b0 = clCreateBuffer(context, 0, n * sizeof(float), nullptr, nullptr); + auto b1 = clCreateBuffer(context, 0, n * sizeof(float), nullptr, nullptr); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clSetKernelArg(kernel.get(), 0, sizeof(cl_mem), &b0); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clSetKernelArg(kernel.get(), 1, 2 * sizeof(cl_mem), &b1); + EXPECT_NE(CL_SUCCESS, retVal); + + EXPECT_FALSE(kernel->isPatched()); + retVal = clEnqueueNDRangeKernelINTEL(pCmdQ2, kernel.get(), 1, nullptr, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL_ARGS, retVal); + + retVal = clReleaseMemObject(b0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(b1); + EXPECT_EQ(CL_SUCCESS, retVal); + clReleaseCommandQueue(pCmdQ2); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(EnqueueKernelTest, GivenInvalidEventListCountWhenEnqueuingKernelINTELThenInvalidEventWaitListErrorIsReturned) { + size_t workgroupCount[3] = {1, 1, 1}; + + auto retVal = clEnqueueNDRangeKernelINTEL( + pCmdQ, + pKernel, + 1, + nullptr, + workgroupCount, + nullptr, + 1, + nullptr, + nullptr); + + EXPECT_EQ(CL_INVALID_EVENT_WAIT_LIST, retVal); +} + HWTEST_F(EnqueueKernelTest, bumpsTaskLevel) { auto taskLevelBefore = pCmdQ->taskLevel; callOneWorkItemNDRKernel(); diff --git a/unit_tests/command_queue/sync_buffer_handler_tests.cpp b/unit_tests/command_queue/sync_buffer_handler_tests.cpp new file mode 100644 index 0000000000..7dbeb88be3 --- /dev/null +++ b/unit_tests/command_queue/sync_buffer_handler_tests.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "core/program/sync_buffer_handler.h" +#include "runtime/api/api.h" +#include "test.h" +#include "unit_tests/fixtures/enqueue_handler_fixture.h" +#include "unit_tests/mocks/mock_command_queue.h" +#include "unit_tests/mocks/mock_kernel.h" +#include "unit_tests/mocks/mock_mdi.h" + +using namespace NEO; + +class MockSyncBufferHandler : public SyncBufferHandler { + public: + using SyncBufferHandler::bufferSize; + using SyncBufferHandler::graphicsAllocation; + using SyncBufferHandler::usedBufferSize; +}; + +class SyncBufferHandlerTest : public EnqueueHandlerTest { + public: + void SetUp() override {} + void TearDown() override {} + + template + void SetUpT() { + EnqueueHandlerTest::SetUp(); + kernelInternals = std::make_unique(*pDevice, context); + kernel = kernelInternals->mockKernel; + commandQueue = reinterpret_cast(new MockCommandQueueHw(context, pDevice, 0)); + } + + template + void TearDownT() { + commandQueue->release(); + kernelInternals.reset(); + EnqueueHandlerTest::TearDown(); + } + + void patchAllocateSyncBuffer() { + sPatchAllocateSyncBuffer.SurfaceStateHeapOffset = 0; + sPatchAllocateSyncBuffer.DataParamOffset = 0; + sPatchAllocateSyncBuffer.DataParamSize = sizeof(uint8_t); + kernelInternals->kernelInfo.patchInfo.pAllocateSyncBuffer = &sPatchAllocateSyncBuffer; + } + + MockSyncBufferHandler *getSyncBufferHandler() { + return reinterpret_cast(pDevice->syncBufferHandler.get()); + } + + const cl_uint workDim = 1; + const size_t gwOffset[3] = {0, 0, 0}; + const size_t lws[3] = {10, 1, 1}; + size_t workgroupCount[3] = {10, 1, 1}; + size_t workItemsCount = 10; + std::unique_ptr kernelInternals; + MockKernel *kernel; + MockCommandQueue *commandQueue; + SPatchAllocateSyncBuffer sPatchAllocateSyncBuffer; +}; + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchWhenEnqueuingKernelThenSyncBufferIsUsed) { + patchAllocateSyncBuffer(); + clEnqueueNDRangeKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); + + auto syncBufferHandler = getSyncBufferHandler(); + EXPECT_EQ(workItemsCount, syncBufferHandler->usedBufferSize); + + commandQueue->flush(); + EXPECT_EQ(syncBufferHandler->graphicsAllocation->getTaskCount( + pDevice->getUltCommandStreamReceiver().getOsContext().getContextId()), + pDevice->getUltCommandStreamReceiver().latestSentTaskCount); +} + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenNoAllocateSyncBufferPatchWhenEnqueuingKernelThenSyncBufferIsNotUsedAndUsedBufferSizeIsNotUpdated) { + clEnqueueNDRangeKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); + + auto syncBufferHandler = getSyncBufferHandler(); + EXPECT_EQ(0u, syncBufferHandler->usedBufferSize); +} + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingKernelThenSuccessIsReturned) { + auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws); + workgroupCount[0] = maxWorkGroupCount; + auto retVal = clEnqueueNDRangeKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingKernelThenErrorIsReturned) { + size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws); + workgroupCount[0] = maxWorkGroupCount + 1; + auto retVal = clEnqueueNDRangeKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_VALUE, retVal); +} + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSyncBufferFullWhenEnqueuingKernelThenNewBufferIsAllocated) { + patchAllocateSyncBuffer(); + clEnqueueNDRangeKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); + + auto syncBufferHandler = getSyncBufferHandler(); + syncBufferHandler->usedBufferSize = syncBufferHandler->bufferSize; + clEnqueueNDRangeKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr); + EXPECT_EQ(workItemsCount, syncBufferHandler->usedBufferSize); +} + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBufferThenSshIsProperlyPatched) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + kernelInternals->kernelInfo.usesSsh = true; + kernelInternals->kernelInfo.requiresSshForBuffers = true; + patchAllocateSyncBuffer(); + + pDevice->allocateSyncBufferHandler(); + auto syncBufferHandler = getSyncBufferHandler(); + auto surfaceState = reinterpret_cast(ptrOffset(kernel->getSurfaceStateHeap(), + sPatchAllocateSyncBuffer.SurfaceStateHeapOffset)); + auto surfaceAddress = surfaceState->getSurfaceBaseAddress(); + auto bufferAddress = syncBufferHandler->graphicsAllocation->getGpuAddress(); + EXPECT_NE(bufferAddress, surfaceAddress); + + kernel->patchSyncBuffer(commandQueue->getDevice(), syncBufferHandler->graphicsAllocation, syncBufferHandler->usedBufferSize); + surfaceAddress = surfaceState->getSurfaceBaseAddress(); + EXPECT_EQ(bufferAddress, surfaceAddress); +} + +HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenKernelUsingSyncBufferWhenUsingStandardEnqueueThenErrorIsReturned) { + patchAllocateSyncBuffer(); + + size_t globalWorkSize[3] = {workgroupCount[0] * lws[0], workgroupCount[1] * lws[1], workgroupCount[2] * lws[2]}; + auto retVal = clEnqueueNDRangeKernel(commandQueue, kernel, workDim, gwOffset, globalWorkSize, lws, 0, nullptr, nullptr); + EXPECT_EQ(CL_INVALID_KERNEL, retVal); +} + +TEST(SyncBufferHandlerDeviceTest, GivenRootDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) { + const size_t testUsedBufferSize = 100; + MockDevice rootDevice; + rootDevice.allocateSyncBufferHandler(); + auto syncBufferHandler = reinterpret_cast(rootDevice.syncBufferHandler.get()); + + ASSERT_NE(syncBufferHandler->usedBufferSize, testUsedBufferSize); + syncBufferHandler->usedBufferSize = testUsedBufferSize; + + rootDevice.allocateSyncBufferHandler(); + syncBufferHandler = reinterpret_cast(rootDevice.syncBufferHandler.get()); + + EXPECT_EQ(testUsedBufferSize, syncBufferHandler->usedBufferSize); +} + +TEST(SyncBufferHandlerDeviceTest, GivenSubDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) { + const size_t testUsedBufferSize = 100; + MockDevice rootDevice; + std::unique_ptr subDevice{reinterpret_cast(rootDevice.createSubDevice(0))}; + subDevice->allocateSyncBufferHandler(); + auto syncBufferHandler = reinterpret_cast(subDevice->syncBufferHandler.get()); + + ASSERT_NE(syncBufferHandler->usedBufferSize, testUsedBufferSize); + syncBufferHandler->usedBufferSize = testUsedBufferSize; + + subDevice->allocateSyncBufferHandler(); + syncBufferHandler = reinterpret_cast(subDevice->syncBufferHandler.get()); + + EXPECT_EQ(testUsedBufferSize, syncBufferHandler->usedBufferSize); +} diff --git a/unit_tests/fixtures/kernel_data_fixture.cpp b/unit_tests/fixtures/kernel_data_fixture.cpp index 9391c7a647..0a781db8eb 100644 --- a/unit_tests/fixtures/kernel_data_fixture.cpp +++ b/unit_tests/fixtures/kernel_data_fixture.cpp @@ -29,7 +29,6 @@ void KernelDataTest::buildAndDecode() { kernelNameSize + sshSize + dshSize + gshSize + kernelHeapSize + patchListSize; kernelDataSize += sizeof(SPatchDataParameterStream); - program->setDevice(pDevice); pKernelData = static_cast(alignedMalloc(kernelDataSize, MemoryConstants::cacheLineSize)); ASSERT_NE(nullptr, pKernelData); diff --git a/unit_tests/fixtures/kernel_data_fixture.h b/unit_tests/fixtures/kernel_data_fixture.h index 0dc1d67669..3d3560575e 100644 --- a/unit_tests/fixtures/kernel_data_fixture.h +++ b/unit_tests/fixtures/kernel_data_fixture.h @@ -7,8 +7,10 @@ #pragma once +#include "runtime/device/device.h" +#include "runtime/memory_manager/memory_manager.h" #include "runtime/program/kernel_info.h" -#include "unit_tests/mocks/mock_device.h" +#include "unit_tests/mocks/mock_context.h" #include "unit_tests/mocks/mock_program.h" #include "gtest/gtest.h" @@ -49,16 +51,17 @@ class KernelDataTest : public testing::Test { protected: void SetUp() override { kernelBinaryHeader.KernelNameSize = kernelNameSize; - pDevice = MockDevice::createWithNewExecutionEnvironment(nullptr); - program = std::make_unique(*pDevice->getExecutionEnvironment()); + pContext = new MockContext; + program = std::make_unique(*pContext->getDevice(0)->getExecutionEnvironment(), pContext, false); } void TearDown() override { if (pKernelInfo->kernelAllocation) { - pDevice->getMemoryManager()->freeGraphicsMemory(pKernelInfo->kernelAllocation); + pContext->getDevice(0)->getMemoryManager()->freeGraphicsMemory(pKernelInfo->kernelAllocation); const_cast(pKernelInfo)->kernelAllocation = nullptr; } - delete pDevice; + program.reset(); + delete pContext; alignedFree(pKernelData); } @@ -85,6 +88,6 @@ class KernelDataTest : public testing::Test { uint32_t kernelUnpaddedSize; std::unique_ptr program; - MockDevice *pDevice; + MockContext *pContext; const KernelInfo *pKernelInfo; }; diff --git a/unit_tests/gtpin/gtpin_tests.cpp b/unit_tests/gtpin/gtpin_tests.cpp index ac65135752..0c875d561c 100644 --- a/unit_tests/gtpin/gtpin_tests.cpp +++ b/unit_tests/gtpin/gtpin_tests.cpp @@ -8,6 +8,7 @@ #include "core/helpers/basic_math.h" #include "core/helpers/file_io.h" #include "core/helpers/hash.h" +#include "runtime/api/api.h" #include "runtime/compiler_interface/patchtokens_decoder.h" #include "runtime/context/context.h" #include "runtime/device/device.h" @@ -813,6 +814,154 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsExecutedThenGTPinCa EXPECT_EQ(CL_SUCCESS, retVal); } +TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGTPinCallbacksAreCalled) { + gtpinCallbacks.onContextCreate = OnContextCreate; + gtpinCallbacks.onContextDestroy = OnContextDestroy; + gtpinCallbacks.onKernelCreate = OnKernelCreate; + gtpinCallbacks.onKernelSubmit = OnKernelSubmit; + gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate; + gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete; + retFromGtPin = GTPin_Init(>pinCallbacks, &driverServices, nullptr); + EXPECT_EQ(GTPIN_DI_SUCCESS, retFromGtPin); + + cl_kernel kernel1 = nullptr; + cl_kernel kernel2 = nullptr; + cl_program pProgram = nullptr; + cl_device_id device = (cl_device_id)pDevice; + size_t sourceSize = 0; + std::string testFile; + cl_command_queue cmdQ = nullptr; + cl_queue_properties properties = 0; + cl_context context = nullptr; + + KernelBinaryHelper kbHelper("CopyBuffer_simd8", false); + testFile.append(clFiles); + testFile.append("CopyBuffer_simd8.cl"); + auto pSource = loadDataFromFile(testFile.c_str(), sourceSize); + EXPECT_NE(0u, sourceSize); + EXPECT_NE(nullptr, pSource); + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(nullptr, context); + + cmdQ = clCreateCommandQueue(context, device, properties, &retVal); + ASSERT_NE(nullptr, cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + const char *sources[1] = {pSource.get()}; + pProgram = clCreateProgramWithSource( + context, + 1, + sources, + &sourceSize, + &retVal); + ASSERT_NE(nullptr, pProgram); + + retVal = clBuildProgram( + pProgram, + 1, + &device, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Create and submit first instance of "CopyBuffer" kernel + int prevCount11 = KernelCreateCallbackCount; + kernel1 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel1); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount11 + 1, KernelCreateCallbackCount); + + Kernel *pKernel1 = (Kernel *)kernel1; + const KernelInfo &kInfo1 = pKernel1->getKernelInfo(); + uint64_t gtpinKernelId1 = pKernel1->getKernelId(); + EXPECT_EQ(kInfo1.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId1); + + cl_uint workDim = 1; + size_t localWorkSize[3] = {1, 1, 1}; + size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize); + auto buff10 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff11 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel1, 0, sizeof(cl_mem), &buff10); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel1, 1, sizeof(cl_mem), &buff11); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount12 = KernelSubmitCallbackCount; + int prevCount13 = CommandBufferCreateCallbackCount; + int prevCount14 = CommandBufferCompleteCallbackCount; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t workgroupCount[3] = {n, 1, 1}; + retVal = clEnqueueNDRangeKernelINTEL(cmdQ, pKernel1, workDim, globalWorkOffset, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount12 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount13 + 1, CommandBufferCreateCallbackCount); + + // Create and submit second instance of "CopyBuffer" kernel + int prevCount21 = KernelCreateCallbackCount; + kernel2 = clCreateKernel(pProgram, "CopyBuffer", &retVal); + EXPECT_NE(nullptr, kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + // Verify that GT-Pin Kernel Create callback is not called multiple times for the same kernel + EXPECT_EQ(prevCount21, KernelCreateCallbackCount); + + Kernel *pKernel2 = (Kernel *)kernel2; + const KernelInfo &kInfo2 = pKernel2->getKernelInfo(); + uint64_t gtpinKernelId2 = pKernel2->getKernelId(); + EXPECT_EQ(kInfo2.heapInfo.pKernelHeader->ShaderHashCode, gtpinKernelId2); + + auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + auto buff21 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); + + retVal = clSetKernelArg(pKernel2, 0, sizeof(cl_mem), &buff20); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clSetKernelArg(pKernel2, 1, sizeof(cl_mem), &buff21); + EXPECT_EQ(CL_SUCCESS, retVal); + + int prevCount22 = KernelSubmitCallbackCount; + int prevCount23 = CommandBufferCreateCallbackCount; + int prevCount24 = CommandBufferCompleteCallbackCount; + retVal = clEnqueueNDRangeKernelINTEL(cmdQ, pKernel2, workDim, globalWorkOffset, workgroupCount, localWorkSize, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount22 + 1, KernelSubmitCallbackCount); + EXPECT_EQ(prevCount23 + 1, CommandBufferCreateCallbackCount); + + retVal = clFinish(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(prevCount14 + 2, CommandBufferCompleteCallbackCount); + EXPECT_EQ(prevCount24 + 2, CommandBufferCompleteCallbackCount); + + // Cleanup + retVal = clReleaseKernel(kernel1); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseKernel(kernel2); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); + + pSource.reset(); + + retVal = clReleaseMemObject(buff10); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff11); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff20); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseMemObject(buff21); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseCommandQueue(cmdQ); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseContext(context); + EXPECT_EQ(CL_SUCCESS, retVal); +} + TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenKernelCreateCallbacksIsNotCalled) { gtpinCallbacks.onContextCreate = OnContextCreate; gtpinCallbacks.onContextDestroy = OnContextDestroy; diff --git a/unit_tests/program/kernel_data.cpp b/unit_tests/program/kernel_data.cpp index 3c2fefad23..5d0521f135 100644 --- a/unit_tests/program/kernel_data.cpp +++ b/unit_tests/program/kernel_data.cpp @@ -167,6 +167,25 @@ TEST_F(KernelDataTest, WhenMediaVFEStateSlot1TokenIsParsedThenCorrectValuesAreSe EXPECT_EQ_VAL(MediaVFEState.ScratchSpaceOffset, pKernelInfo->patchInfo.mediaVfeStateSlot1->ScratchSpaceOffset); } +TEST_F(KernelDataTest, GivenSyncBufferTokenWhenParsingProgramThenTokenIsFound) { + SPatchAllocateSyncBuffer token; + token.Token = PATCH_TOKEN_ALLOCATE_SYNC_BUFFER; + token.Size = static_cast(sizeof(SPatchAllocateSyncBuffer)); + token.SurfaceStateHeapOffset = 32; + token.DataParamOffset = 1024; + token.DataParamSize = 2; + + pPatchList = &token; + patchListSize = token.Size; + + buildAndDecode(); + + EXPECT_EQ(token.Token, pKernelInfo->patchInfo.pAllocateSyncBuffer->Token); + EXPECT_EQ(token.SurfaceStateHeapOffset, pKernelInfo->patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset); + EXPECT_EQ(token.DataParamOffset, pKernelInfo->patchInfo.pAllocateSyncBuffer->DataParamOffset); + EXPECT_EQ(token.DataParamSize, pKernelInfo->patchInfo.pAllocateSyncBuffer->DataParamSize); +} + TEST_F(KernelDataTest, MediaIDData) { iOpenCL::SPatchInterfaceDescriptorData idData; idData.Token = PATCH_TOKEN_INTERFACE_DESCRIPTOR_DATA;