From b04a337b3fe2bc1401effe0f729f63c73c58cfcd Mon Sep 17 00:00:00 2001
From: Zbigniew Zdanowicz
Date: Tue, 19 Jul 2022 21:38:23 +0000
Subject: [PATCH] Add copy tracing black box test

Signed-off-by: Zbigniew Zdanowicz
---
 .../core/test/black_box_tests/CMakeLists.txt  |   1 +
 .../black_box_tests/zello_copy_tracing.cpp    | 841 ++++++++++++++++++
 2 files changed, 842 insertions(+)
 create mode 100644 level_zero/core/test/black_box_tests/zello_copy_tracing.cpp

diff --git a/level_zero/core/test/black_box_tests/CMakeLists.txt b/level_zero/core/test/black_box_tests/CMakeLists.txt
index cba214e0a2..a473fa2fb0 100644
--- a/level_zero/core/test/black_box_tests/CMakeLists.txt
+++ b/level_zero/core/test/black_box_tests/CMakeLists.txt
@@ -12,6 +12,7 @@ set(TEST_TARGETS
     zello_copy_image
     zello_copy_kernel_printf
     zello_copy_only
+    zello_copy_tracing
     zello_debug_info
     zello_dynamic_link
     zello_dyn_local_arg
diff --git a/level_zero/core/test/black_box_tests/zello_copy_tracing.cpp b/level_zero/core/test/black_box_tests/zello_copy_tracing.cpp
new file mode 100644
index 0000000000..b4c1a5adb5
--- /dev/null
+++ b/level_zero/core/test/black_box_tests/zello_copy_tracing.cpp
@@ -0,0 +1,841 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+// NOTE(review): the header names inside the angle brackets were missing from the
+// copy this patch was restored from; the lists below were rebuilt from the symbols
+// this file uses and must be confirmed against the upstream sources.
+#include <level_zero/ze_api.h>
+#include <level_zero/ze_ddi.h>
+#include <level_zero/zet_api.h>
+
+#include "zello_common.h"
+
+#include <cstring>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#ifdef _WIN64
+#include <windows.h>
+#else
+#include <stdlib.h>
+#endif
+
+// Set in main() from isVerbose(); gates diagnostics and picks the small 2D geometry.
+bool verbose = false;
+
+// Payload passed to the tracer as the third member of zet_tracer_exp_desc_t.
+struct UserTracerData {
+    uint32_t tracerData;
+};
+
+UserTracerData tracerData0 = {};
+
+// Per-call record created by a prologue and read, then freed, by the matching epilogue.
+struct UserInstanceData {
+    std::clock_t startTime;
+    uint32_t allocCount;
+};
+
+// zeInit bookkeeping: calls announced through setInitParams() and callbacks seen.
+uint32_t initCount;
+uint32_t initPrologCount;
+uint32_t initEpilogCount;
+
+struct TmpInitParams {
+    ze_init_flag_t flags;
+};
+
+TmpInitParams initParams;
+
+// Records the arguments of the upcoming zeInit call and counts the call.
+void setInitParams(ze_init_flag_t flags) {
+    initParams.flags = flags;
+    initCount++;
+}
+
+// Checks the traced zeInit arguments against the recorded ones.
+void checkInitParams(ze_init_params_t *traceParams, TmpInitParams *checkParams) {
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->pflags) == checkParams->flags);
+}
+
+// zeInit prologue: checks the arguments and hands a fresh per-call record to the epilogue.
+void onEnterInit(
+    ze_init_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    checkInitParams(tracerParams, &initParams);
+    UserInstanceData *instanceData = new UserInstanceData;
+    instanceData->startTime = clock();
+    instanceData->allocCount = initCount;
+    *tracerInstanceUserData = reinterpret_cast<void *>(instanceData);
+    initPrologCount++;
+}
+
+// zeInit epilogue: checks the result and the prologue's record, then releases the record.
+void onExitInit(
+    ze_init_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    clock_t endTime = clock();
+    SUCCESS_OR_WARNING_BOOL(result == ZE_RESULT_SUCCESS);
+    UserInstanceData *instanceData = reinterpret_cast<UserInstanceData *>(*tracerInstanceUserData);
+    // Must compare, not assign: the prologue snapshot has to equal the call counter.
+    SUCCESS_OR_WARNING_BOOL(instanceData->allocCount == initCount);
+    float time = 1000.f * (endTime - instanceData->startTime) / CLOCKS_PER_SEC;
+    if (verbose) {
+        std::cout << "zeInit event " << instanceData->allocCount << " " << time << std::endl;
+    }
+    delete instanceData;
+    checkInitParams(tracerParams, &initParams);
+    initEpilogCount++;
+}
+
+// zeDriverGet bookkeeping: calls announced through setDriverGetParams() and callbacks seen.
+uint32_t driverGetCount;
+uint32_t driverGetPrologCount;
+uint32_t driverGetEpilogCount;
+
+struct TmpDriverGetParams {
+    uint32_t *count;
+    ze_driver_handle_t *drivers;
+};
+
+TmpDriverGetParams driverGetParams;
+
+// Records the arguments of the upcoming zeDriverGet call and counts the call.
+void setDriverGetParams(uint32_t *count, ze_driver_handle_t *drivers) {
+    driverGetParams.count = count;
+    driverGetParams.drivers = drivers;
+    driverGetCount++;
+}
+
+// Checks the traced zeDriverGet pointer arguments against the recorded ones.
+void checkDriverGetParams(ze_driver_get_params_t *traceParams, TmpDriverGetParams *checkParams) {
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->ppCount) == checkParams->count);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->pphDrivers) == checkParams->drivers);
+}
+
+// zeDriverGet prologue: checks the arguments and hands a per-call record to the epilogue.
+void onEnterDriverGet(
+    ze_driver_get_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    checkDriverGetParams(tracerParams, &driverGetParams);
+    UserInstanceData *instanceData = new UserInstanceData;
+    instanceData->startTime = clock();
+    // Snapshot the zeDriverGet counter (not the zeInit one) for the epilogue check.
+    instanceData->allocCount = driverGetCount;
+    *tracerInstanceUserData = reinterpret_cast<void *>(instanceData);
+    driverGetPrologCount++;
+}
+
+// zeDriverGet epilogue: checks the result and that the snapshot equals driverGetCount.
+void onExitDriverGet(
+    ze_driver_get_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    clock_t endTime = clock();
+    SUCCESS_OR_WARNING_BOOL(result == ZE_RESULT_SUCCESS);
+    UserInstanceData *instanceData = reinterpret_cast<UserInstanceData *>(*tracerInstanceUserData);
+    SUCCESS_OR_WARNING_BOOL(instanceData->allocCount == driverGetCount);
+    float time = 1000.f * (endTime - instanceData->startTime) / CLOCKS_PER_SEC;
+    if (verbose) {
+        std::cout << "zeDriverGet event " << instanceData->allocCount << " " << time << std::endl;
+    }
+    delete instanceData;
+    checkDriverGetParams(tracerParams, &driverGetParams);
+    driverGetEpilogCount++;
+}
+
+// zeMemAllocDevice bookkeeping: announced calls and prologue/epilogue callbacks seen.
+uint32_t memAllocDeviceCount;
+uint32_t memAllocDevicePrologCount;
+uint32_t memAllocDeviceEpilogCount;
+
+struct TmpMemAllocDeviceParams {
+    ze_context_handle_t context;
+    ze_device_mem_alloc_desc_t *deviceDesc;
+    size_t size;
+    size_t alignment;
+    ze_device_handle_t device;
+    void *buffer;
+};
+
+TmpMemAllocDeviceParams allocMemDeviceParams;
+
+// Records the arguments of the upcoming device allocation and counts the call.
+void setMemAllocDeviceParams(ze_context_handle_t context, ze_device_mem_alloc_desc_t *deviceDesc,
+                             size_t allocSize, size_t alignment, ze_device_handle_t device, void *buffer) {
+    allocMemDeviceParams.context = context;
+    allocMemDeviceParams.deviceDesc = deviceDesc;
+    allocMemDeviceParams.size = allocSize;
+    allocMemDeviceParams.alignment = alignment;
+    allocMemDeviceParams.device = device;
+    allocMemDeviceParams.buffer = buffer;
+    memAllocDeviceCount++;
+}
+
+// Checks every traced device-allocation argument against the recorded one.
+void checkMemAllocDeviceParams(ze_mem_alloc_device_params_t *traceParams, TmpMemAllocDeviceParams *checkParams) {
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phContext) == checkParams->context);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->pdevice_desc) == checkParams->deviceDesc);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->psize) == checkParams->size);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->palignment) == checkParams->alignment);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phDevice) == checkParams->device);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->ppptr) == checkParams->buffer);
+}
+
+// Device-allocation prologue: checks the arguments and creates the per-call record.
+void onEnterMemAllocDevice(
+    ze_mem_alloc_device_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    checkMemAllocDeviceParams(tracerParams, &allocMemDeviceParams);
+    UserInstanceData *instanceData = new UserInstanceData;
+    instanceData->startTime = clock();
+    instanceData->allocCount = memAllocDeviceCount;
+    *tracerInstanceUserData = reinterpret_cast<void *>(instanceData);
+    memAllocDevicePrologCount++;
+}
+
+// Device-allocation epilogue: checks the result and the record, then releases the record.
+void onExitMemAllocDevice(
+    ze_mem_alloc_device_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    clock_t endTime = clock();
+    SUCCESS_OR_WARNING_BOOL(result == ZE_RESULT_SUCCESS);
+    UserInstanceData *instanceData = reinterpret_cast<UserInstanceData *>(*tracerInstanceUserData);
+    SUCCESS_OR_WARNING_BOOL(instanceData->allocCount == memAllocDeviceCount);
+    float time = 1000.f * (endTime - instanceData->startTime) / CLOCKS_PER_SEC;
+    if (verbose) {
+        std::cout << "zeMemAllocDevice event " << instanceData->allocCount << " " << time << std::endl;
+    }
+    delete instanceData;
+    checkMemAllocDeviceParams(tracerParams, &allocMemDeviceParams);
+    memAllocDeviceEpilogCount++;
+}
+
+// zeMemAllocHost bookkeeping: announced calls and prologue/epilogue callbacks seen.
+uint32_t memAllocHostCount;
+uint32_t memAllocHostPrologCount;
+uint32_t memAllocHostEpilogCount;
+
+struct TmpMemAllocHostParams {
+    ze_context_handle_t context;
+    ze_host_mem_alloc_desc_t *hostDesc;
+    size_t size;
+    size_t alignment;
+    void *buffer;
+};
+
+TmpMemAllocHostParams memAllocHostParams;
+
+// Records the arguments of the upcoming host allocation and counts the call.
+void setMemAllocHostParams(ze_context_handle_t context, ze_host_mem_alloc_desc_t *hostDesc,
+                           size_t allocSize, size_t alignment, void *buffer) {
+    memAllocHostParams.context = context;
+    memAllocHostParams.hostDesc = hostDesc;
+    memAllocHostParams.size = allocSize;
+    memAllocHostParams.alignment = alignment;
+    memAllocHostParams.buffer = buffer;
+    memAllocHostCount++;
+}
+
+// Checks every traced host-allocation argument against the recorded one.
+void checkMemAllocHostParams(ze_mem_alloc_host_params_t *traceParams, TmpMemAllocHostParams *checkParams) {
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phContext) == checkParams->context);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phost_desc) == checkParams->hostDesc);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->psize) == checkParams->size);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->palignment) == checkParams->alignment);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->ppptr) == checkParams->buffer);
+}
+
+// Host-allocation prologue: checks the arguments and creates the per-call record.
+void onEnterMemAllocHost(
+    ze_mem_alloc_host_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    checkMemAllocHostParams(tracerParams, &memAllocHostParams);
+    UserInstanceData *instanceData = new UserInstanceData;
+    instanceData->startTime = clock();
+    instanceData->allocCount = memAllocHostCount;
+    *tracerInstanceUserData = reinterpret_cast<void *>(instanceData);
+    memAllocHostPrologCount++;
+}
+
+// Host-allocation epilogue: checks the result and the record, then releases the record.
+void onExitMemAllocHost(
+    ze_mem_alloc_host_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    clock_t endTime = clock();
+    SUCCESS_OR_WARNING_BOOL(result == ZE_RESULT_SUCCESS);
+    UserInstanceData *instanceData = reinterpret_cast<UserInstanceData *>(*tracerInstanceUserData);
+    SUCCESS_OR_WARNING_BOOL(instanceData->allocCount == memAllocHostCount);
+    float time = 1000.f * (endTime - instanceData->startTime) / CLOCKS_PER_SEC;
+    if (verbose) {
+        std::cout << "zeMemAllocHost event " << instanceData->allocCount << " " << time << std::endl;
+    }
+    delete instanceData;
+    checkMemAllocHostParams(tracerParams, &memAllocHostParams);
+    memAllocHostEpilogCount++;
+}
+
+// zeMemAllocShared bookkeeping: announced calls and prologue/epilogue callbacks seen.
+uint32_t memAllocSharedCount = 0;
+uint32_t memAllocSharedPrologCount = 0;
+uint32_t memAllocSharedEpilogCount = 0;
+
+struct tmpMemAllocSharedParams {
+    ze_context_handle_t context;
+    ze_device_mem_alloc_desc_t *deviceDesc;
+    ze_host_mem_alloc_desc_t *hostDesc;
+    size_t size;
+    size_t alignment;
+    ze_device_handle_t device;
+    void *buffer;
+};
+
+tmpMemAllocSharedParams memAllocSharedParams;
+
+// Records the arguments of the upcoming shared allocation and counts the call.
+void setMemAllocSharedParams(ze_context_handle_t context, ze_device_mem_alloc_desc_t *deviceDesc,
+                             ze_host_mem_alloc_desc_t *hostDesc, size_t allocSize, size_t alignment,
+                             ze_device_handle_t device, void *buffer) {
+    memAllocSharedParams.context = context;
+    memAllocSharedParams.deviceDesc = deviceDesc;
+    memAllocSharedParams.hostDesc = hostDesc;
+    memAllocSharedParams.size = allocSize;
+    memAllocSharedParams.alignment = alignment;
+    memAllocSharedParams.device = device;
+    memAllocSharedParams.buffer = buffer;
+    memAllocSharedCount++;
+}
+
+// Checks every traced shared-allocation argument against the recorded one.
+void checkMemAllocShared(ze_mem_alloc_shared_params_t *traceParams, tmpMemAllocSharedParams *checkParams) {
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phContext) == checkParams->context);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->pdevice_desc) == checkParams->deviceDesc);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phost_desc) == checkParams->hostDesc);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->psize) == checkParams->size);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->palignment) == checkParams->alignment);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->phDevice) == checkParams->device);
+    SUCCESS_OR_WARNING_BOOL(*(traceParams->ppptr) == checkParams->buffer);
+}
+
+// Shared-allocation prologue: checks the arguments and creates the per-call record.
+void onEnterMemAllocShared(
+    ze_mem_alloc_shared_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    checkMemAllocShared(tracerParams, &memAllocSharedParams);
+    UserInstanceData *instanceData = new UserInstanceData;
+    instanceData->startTime = clock();
+    instanceData->allocCount = memAllocSharedCount;
+    *tracerInstanceUserData = reinterpret_cast<void *>(instanceData);
+    memAllocSharedPrologCount++;
+}
+
+// Shared-allocation epilogue: checks the result and the record, then releases the record.
+void onExitMemAllocShared(
+    ze_mem_alloc_shared_params_t *tracerParams,
+    ze_result_t result,
+    void *traceUserData,
+    void **tracerInstanceUserData) {
+
+    clock_t endTime = clock();
+    SUCCESS_OR_WARNING_BOOL(result == ZE_RESULT_SUCCESS);
+    UserInstanceData *instanceData = reinterpret_cast<UserInstanceData *>(*tracerInstanceUserData);
+    SUCCESS_OR_WARNING_BOOL(instanceData->allocCount == memAllocSharedCount);
+    float time = 1000.f * (endTime - instanceData->startTime) / CLOCKS_PER_SEC;
+    if (verbose) {
+        std::cout << "zeMemAllocShared event " << instanceData->allocCount << " " << time << std::endl;
+    }
+    delete instanceData;
+    checkMemAllocShared(tracerParams, &memAllocSharedParams);
+    memAllocSharedEpilogCount++;
+}
+
+// Copies a heap buffer to device memory and back into a stack buffer through the DDI
+// tables; validRet reports whether the stack buffer ended up equal to the heap buffer.
+void testAppendMemoryCopy0(ze_context_handle_t &context, ze_device_handle_t &device, bool &validRet,
+                           ze_device_dditable_t &deviceDdiTable,
+                           ze_command_queue_dditable_t cmdQueueDdiTable,
+                           ze_command_list_dditable_t &cmdListDdiTable,
+                           ze_mem_dditable_t &memDdiTable) {
+    const size_t allocSize = 4096 + 7; // +7 to break alignment and make it harder
+    char *heapBuffer = new char[allocSize];
+    void *zeBuffer = nullptr;
+    char stackBuffer[allocSize];
+
+    // Create command queue
+    uint32_t numQueueGroups = 0;
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetCommandQueueGroupProperties(device, &numQueueGroups, nullptr));
+    if (numQueueGroups == 0) {
+        std::cout << "No queue groups found!\n";
+        std::terminate();
+    }
+    std::vector<ze_command_queue_group_properties_t> queueProperties(numQueueGroups);
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetCommandQueueGroupProperties(device, &numQueueGroups,
+                                                                          queueProperties.data()));
+
+    ze_command_queue_handle_t cmdQueue;
+    ze_command_queue_desc_t cmdQueueDesc = {
+        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        nullptr,
+        0,
+        0,
+        ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY,
+        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+
+    for (uint32_t i = 0; i < numQueueGroups; i++) {
+        if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            cmdQueueDesc.ordinal = i;
+            break;
+        }
+    }
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnCreate(context, device, &cmdQueueDesc, &cmdQueue));
+
+    ze_command_list_handle_t cmdList;
+    ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, nullptr};
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnCreate(context, device, &cmdListDesc, &cmdList));
+
+    ze_device_mem_alloc_desc_t deviceDesc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
+
+    setMemAllocDeviceParams(context, &deviceDesc, allocSize, allocSize, device, &zeBuffer);
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnAllocDevice(context, &deviceDesc, allocSize, allocSize, device, &zeBuffer));
+
+    for (size_t i = 0; i < allocSize; ++i) {
+        heapBuffer[i] = static_cast<char>(i + 1);
+    }
+    memset(stackBuffer, 0, allocSize);
+
+    // Copy from heap to device-allocated memory
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryCopy(cmdList, zeBuffer, heapBuffer, allocSize,
+                                                             nullptr, 0, nullptr));
+
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendBarrier(cmdList, nullptr, 0, nullptr));
+
+    // Copy from device-allocated memory to stack
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryCopy(cmdList, stackBuffer, zeBuffer, allocSize,
+                                                             nullptr, 0, nullptr));
+
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnClose(cmdList));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnExecuteCommandLists(cmdQueue, 1, &cmdList, nullptr));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnSynchronize(cmdQueue, std::numeric_limits<uint64_t>::max()));
+
+    // Validate stack and ze buffers have the original data from heapBuffer
+    validRet = (0 == memcmp(heapBuffer, stackBuffer, allocSize));
+
+    delete[] heapBuffer;
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnFree(context, zeBuffer));
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnDestroy(cmdList));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnDestroy(cmdQueue));
+}
+
+// Copies a pfnAllocHost buffer to device memory and back into a stack buffer;
+// validRet reports whether the stack buffer ended up equal to the host buffer.
+void testAppendMemoryCopy1(ze_context_handle_t &context, ze_device_handle_t &device, bool &validRet,
+                           ze_device_dditable_t &deviceDdiTable,
+                           ze_command_queue_dditable_t cmdQueueDdiTable,
+                           ze_command_list_dditable_t cmdListDdiTable,
+                           ze_mem_dditable_t &memDdiTable) {
+    const size_t allocSize = 4096 + 7; // +7 to break alignment and make it harder
+    char *hostBuffer;
+    void *zeBuffer = nullptr;
+    char stackBuffer[allocSize];
+
+    // Create command queue
+    uint32_t numQueueGroups = 0;
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetCommandQueueGroupProperties(device, &numQueueGroups, nullptr));
+    if (numQueueGroups == 0) {
+        std::cout << "No queue groups found!\n";
+        std::terminate();
+    }
+    std::vector<ze_command_queue_group_properties_t> queueProperties(numQueueGroups);
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetCommandQueueGroupProperties(device, &numQueueGroups,
+                                                                          queueProperties.data()));
+
+    ze_command_queue_handle_t cmdQueue;
+    ze_command_queue_desc_t cmdQueueDesc = {
+        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        nullptr,
+        0,
+        0,
+        ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY,
+        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+
+    for (uint32_t i = 0; i < numQueueGroups; i++) {
+        if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            cmdQueueDesc.ordinal = i;
+            break;
+        }
+    }
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnCreate(context, device, &cmdQueueDesc, &cmdQueue));
+
+    ze_command_list_handle_t cmdList;
+    ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, nullptr};
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnCreate(context, device, &cmdListDesc, &cmdList));
+
+    ze_host_mem_alloc_desc_t hostDesc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC};
+
+    setMemAllocHostParams(context, &hostDesc, allocSize, 1, reinterpret_cast<void **>(&hostBuffer));
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnAllocHost(context, &hostDesc, allocSize, 1, reinterpret_cast<void **>(&hostBuffer)));
+
+    ze_device_mem_alloc_desc_t deviceDesc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
+
+    setMemAllocDeviceParams(context, &deviceDesc, allocSize, allocSize, device, &zeBuffer);
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnAllocDevice(context, &deviceDesc, allocSize, allocSize, device, &zeBuffer));
+
+    for (size_t i = 0; i < allocSize; ++i) {
+        hostBuffer[i] = static_cast<char>(i + 1);
+    }
+    memset(stackBuffer, 0, allocSize);
+
+    // Copy from host-allocated to device-allocated memory
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryCopy(cmdList, zeBuffer, hostBuffer, allocSize,
+                                                             nullptr, 0, nullptr));
+
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendBarrier(cmdList, nullptr, 0, nullptr));
+
+    // Copy from device-allocated memory to stack
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryCopy(cmdList, stackBuffer, zeBuffer, allocSize,
+                                                             nullptr, 0, nullptr));
+
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnClose(cmdList));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnExecuteCommandLists(cmdQueue, 1, &cmdList, nullptr));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnSynchronize(cmdQueue, std::numeric_limits<uint64_t>::max()));
+
+    // Validate stack and ze buffers have the original data from hostBuffer
+    validRet = (0 == memcmp(hostBuffer, stackBuffer, allocSize));
+
+    // hostBuffer was allocated with pfnAllocHost, so it is released with pfnFree
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnFree(context, hostBuffer));
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnFree(context, zeBuffer));
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnDestroy(cmdList));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnDestroy(cmdQueue));
+}
+
+// Copies a 2D region between two shared allocations with pfnAppendMemoryCopyRegion;
+// validRet becomes false if any byte of the destination region differs from the source.
+void testAppendMemoryCopy2(ze_context_handle_t &context, ze_device_handle_t &device, bool &validRet,
+                           ze_device_dditable_t &deviceDdiTable,
+                           ze_command_queue_dditable_t cmdQueueDdiTable,
+                           ze_command_list_dditable_t cmdListDdiTable,
+                           ze_mem_dditable_t &memDdiTable) {
+    validRet = true;
+
+    // Create command queue
+    uint32_t numQueueGroups = 0;
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetCommandQueueGroupProperties(device, &numQueueGroups, nullptr));
+    if (numQueueGroups == 0) {
+        std::cout << "No queue groups found!\n";
+        std::terminate();
+    }
+    std::vector<ze_command_queue_group_properties_t> queueProperties(numQueueGroups);
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetCommandQueueGroupProperties(device, &numQueueGroups,
+                                                                          queueProperties.data()));
+
+    ze_command_queue_handle_t cmdQueue;
+    ze_command_queue_desc_t cmdQueueDesc = {
+        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        nullptr,
+        0,
+        0,
+        ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY,
+        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+
+    for (uint32_t i = 0; i < numQueueGroups; i++) {
+        if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            cmdQueueDesc.ordinal = i;
+            break;
+        }
+    }
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnCreate(context, device, &cmdQueueDesc, &cmdQueue));
+
+    ze_command_list_handle_t cmdList;
+    ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, nullptr};
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnCreate(context, device, &cmdListDesc, &cmdList));
+
+    void *dstBuffer = nullptr;
+    uint32_t dstWidth = verbose ? 16 : 1024; // width of the dst 2D buffer in bytes
+    uint32_t dstHeight = verbose ? 32 : 512; // height of the dst 2D buffer in rows
+    uint32_t dstOriginX = verbose ? 8 : 128; // Offset in bytes
+    uint32_t dstOriginY = verbose ? 8 : 144; // Offset in rows
+    uint32_t dstSize = dstHeight * dstWidth; // Size of the dst buffer
+
+    void *srcBuffer = nullptr;
+    uint32_t srcWidth = verbose ? 24 : 256;  // width of the src 2D buffer in bytes
+    uint32_t srcHeight = verbose ? 16 : 384; // height of the src 2D buffer in rows
+    uint32_t srcOriginX = verbose ? 4 : 64;  // Offset in bytes
+    uint32_t srcOriginY = verbose ? 4 : 128; // Offset in rows
+    uint32_t srcSize = srcHeight * srcWidth; // Size of the src buffer
+
+    uint32_t width = verbose ? 8 : 144; // width of the region to copy
+    uint32_t height = verbose ? 12 : 96; // height of the region to copy
+    const ze_copy_region_t dstRegion = {dstOriginX, dstOriginY, 0, width, height, 0};
+    const ze_copy_region_t srcRegion = {srcOriginX, srcOriginY, 0, width, height, 0};
+
+    ze_device_mem_alloc_desc_t deviceDesc0 = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
+    ze_host_mem_alloc_desc_t hostDesc0 = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC};
+    setMemAllocSharedParams(context, &deviceDesc0, &hostDesc0, srcSize, 1, device, &srcBuffer);
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnAllocShared(context, &deviceDesc0, &hostDesc0, srcSize, 1, device, &srcBuffer));
+
+    ze_device_mem_alloc_desc_t deviceDesc1 = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
+    ze_host_mem_alloc_desc_t hostDesc1 = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC};
+    setMemAllocSharedParams(context, &deviceDesc1, &hostDesc1, dstSize, 1, device, &dstBuffer);
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnAllocShared(context, &deviceDesc1, &hostDesc1, dstSize, 1, device, &dstBuffer));
+
+    // Initialize buffers
+    uint8_t *stackBuffer = new uint8_t[srcSize];
+    for (uint32_t i = 0; i < srcHeight; i++) {
+        for (uint32_t j = 0; j < srcWidth; j++) {
+            stackBuffer[i * srcWidth + j] = static_cast<uint8_t>(i * srcWidth + j);
+        }
+    }
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryCopy(cmdList, srcBuffer, stackBuffer, srcSize,
+                                                             nullptr, 0, nullptr));
+
+    int value = 0;
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryFill(cmdList, dstBuffer, reinterpret_cast<void *>(&value),
+                                                             sizeof(value), dstSize, nullptr, 0, nullptr));
+
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendBarrier(cmdList, nullptr, 0, nullptr));
+
+    // Perform the copy
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnAppendMemoryCopyRegion(cmdList, dstBuffer, &dstRegion, dstWidth, 0,
+                                                                   const_cast<const void *>(srcBuffer), &srcRegion, srcWidth, 0,
+                                                                   nullptr, 0, nullptr));
+
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnClose(cmdList));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnExecuteCommandLists(cmdQueue, 1, &cmdList, nullptr));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnSynchronize(cmdQueue, std::numeric_limits<uint64_t>::max()));
+
+    uint8_t *dstBufferChar = reinterpret_cast<uint8_t *>(dstBuffer);
+    if (verbose) {
+        std::cout << "stackBuffer\n";
+        for (uint32_t i = 0; i < srcHeight; i++) {
+            for (uint32_t j = 0; j < srcWidth; j++) {
+                std::cout << std::setw(3) << std::dec << static_cast<unsigned int>(stackBuffer[i * srcWidth + j]) << " ";
+            }
+            std::cout << "\n";
+        }
+
+        std::cout << "dstBuffer\n";
+        for (uint32_t i = 0; i < dstHeight; i++) {
+            for (uint32_t j = 0; j < dstWidth; j++) {
+                std::cout << std::setw(3) << std::dec << static_cast<unsigned int>(dstBufferChar[i * dstWidth + j]) << " ";
+            }
+            std::cout << "\n";
+        }
+    }
+
+    uint32_t dstOffset = dstOriginX + dstOriginY * dstWidth;
+    uint32_t srcOffset = srcOriginX + srcOriginY * srcWidth;
+    for (uint32_t i = 0; i < height; i++) {
+        for (uint32_t j = 0; j < width; j++) {
+            uint8_t dstVal = dstBufferChar[dstOffset + (i * dstWidth) + j];
+            uint8_t srcVal = stackBuffer[srcOffset + (i * srcWidth) + j];
+            if (dstVal != srcVal) {
+                validRet = false;
+            }
+        }
+    }
+
+    delete[] stackBuffer;
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnFree(context, srcBuffer));
+    SUCCESS_OR_TERMINATE(memDdiTable.pfnFree(context, dstBuffer));
+    SUCCESS_OR_TERMINATE(cmdListDdiTable.pfnDestroy(cmdList));
+    SUCCESS_OR_TERMINATE(cmdQueueDdiTable.pfnDestroy(cmdQueue));
+}
+
+// Sets an environment variable: SetEnvironmentVariableA under _WIN64, setenv (overwrite) otherwise.
+void setEnvironmentVariable(const char *variableName, const char *variableValue) {
+#ifdef _WIN64
+    SetEnvironmentVariableA(variableName, variableValue);
+#else
+    setenv(variableName, variableValue, 1);
+#endif
+}
+
+// Registers prologue/epilogue tracer callbacks for zeInit, zeDriverGet and the three memory
+// allocation calls, runs the copy tests, then requires announced/prologue/epilogue counts to agree.
+int main(int argc, char *argv[]) {
+    verbose = isVerbose(argc, argv);
+
+    setEnvironmentVariable("ZET_ENABLE_API_TRACING_EXP", "1");
+
+    ze_api_version_t apiVersion = ZE_API_VERSION_CURRENT;
+
+    ze_global_dditable_t globalDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetGlobalProcAddrTable(apiVersion, &globalDdiTable));
+
+    ze_driver_dditable_t driverDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetDriverProcAddrTable(apiVersion, &driverDdiTable));
+
+    ze_device_dditable_t deviceDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetDeviceProcAddrTable(apiVersion, &deviceDdiTable));
+
+    ze_context_dditable_t contextDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetContextProcAddrTable(apiVersion, &contextDdiTable));
+
+    ze_command_queue_dditable_t cmdQueueDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetCommandQueueProcAddrTable(apiVersion, &cmdQueueDdiTable));
+
+    ze_command_list_dditable_t cmdListDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetCommandListProcAddrTable(apiVersion, &cmdListDdiTable));
+
+    ze_mem_dditable_t memDdiTable;
+    SUCCESS_OR_TERMINATE(zeGetMemProcAddrTable(apiVersion, &memDdiTable));
+
+    SUCCESS_OR_TERMINATE(globalDdiTable.pfnInit(ZE_INIT_FLAG_GPU_ONLY));
+
+    uint32_t driverCount = 0;
+    SUCCESS_OR_TERMINATE(driverDdiTable.pfnGet(&driverCount, nullptr));
+    if (driverCount == 0)
+        std::terminate();
+
+    ze_driver_handle_t driver;
+    driverCount = 1;
+    SUCCESS_OR_TERMINATE(driverDdiTable.pfnGet(&driverCount, &driver));
+
+    uint32_t deviceCount = 0;
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGet(driver, &deviceCount, nullptr));
+    if (deviceCount == 0)
+        std::terminate();
+
+    ze_device_handle_t device;
+    deviceCount = 1;
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGet(driver, &deviceCount, &device));
+
+    ze_context_handle_t context;
+    ze_context_desc_t contextDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+    SUCCESS_OR_TERMINATE(contextDdiTable.pfnCreate(driver, &contextDesc, &context));
+
+    zet_tracer_exp_desc_t tracerDesc = {ZET_STRUCTURE_TYPE_TRACER_EXP_DESC, nullptr, &tracerData0};
+    zet_tracer_exp_handle_t tracer;
+    SUCCESS_OR_TERMINATE(zetTracerExpCreate(context, &tracerDesc, &tracer));
+
+    ze_callbacks_t prologCbs = {};
+    prologCbs.Global.pfnInitCb = onEnterInit;
+    prologCbs.Driver.pfnGetCb = onEnterDriverGet;
+    prologCbs.Mem.pfnAllocDeviceCb = onEnterMemAllocDevice;
+    prologCbs.Mem.pfnAllocHostCb = onEnterMemAllocHost;
+    prologCbs.Mem.pfnAllocSharedCb = onEnterMemAllocShared;
+    SUCCESS_OR_TERMINATE(zetTracerExpSetPrologues(tracer, &prologCbs));
+
+    ze_callbacks_t epilogCbs = {};
+    epilogCbs.Global.pfnInitCb = onExitInit;
+    epilogCbs.Driver.pfnGetCb = onExitDriverGet;
+    epilogCbs.Mem.pfnAllocDeviceCb = onExitMemAllocDevice;
+    epilogCbs.Mem.pfnAllocHostCb = onExitMemAllocHost;
+    epilogCbs.Mem.pfnAllocSharedCb = onExitMemAllocShared;
+    SUCCESS_OR_TERMINATE(zetTracerExpSetEpilogues(tracer, &epilogCbs));
+
+    SUCCESS_OR_TERMINATE(zetTracerExpSetEnabled(tracer, true));
+
+    setInitParams(ZE_INIT_FLAG_GPU_ONLY);
+    SUCCESS_OR_TERMINATE(globalDdiTable.pfnInit(ZE_INIT_FLAG_GPU_ONLY));
+
+    ze_driver_handle_t driverTest;
+    setDriverGetParams(&driverCount, &driverTest);
+    SUCCESS_OR_TERMINATE(driverDdiTable.pfnGet(&driverCount, &driverTest));
+
+    ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
+    SUCCESS_OR_TERMINATE(deviceDdiTable.pfnGetProperties(device, &deviceProperties));
+    printDeviceProperties(deviceProperties);
+
+    bool outputValidationSuccessful = false;
+    testAppendMemoryCopy0(context, device, outputValidationSuccessful,
+                          deviceDdiTable, cmdQueueDdiTable, cmdListDdiTable, memDdiTable);
+    if (outputValidationSuccessful) {
+        testAppendMemoryCopy1(context, device, outputValidationSuccessful,
+                              deviceDdiTable, cmdQueueDdiTable, cmdListDdiTable, memDdiTable);
+    }
+    if (outputValidationSuccessful) {
+        testAppendMemoryCopy2(context, device, outputValidationSuccessful,
+                              deviceDdiTable, cmdQueueDdiTable, cmdListDdiTable, memDdiTable);
+    }
+
+    /* tear down tracing environment and test epilog/prolog counts */
+    SUCCESS_OR_TERMINATE(zetTracerExpSetEnabled(tracer, false));
+    SUCCESS_OR_TERMINATE(zetTracerExpDestroy(tracer));
+
+    if (verbose) {
+        std::cout << "initCount: " << initCount
+                  << " initPrologCount: " << initPrologCount
+                  << " initEpilogCount: " << initEpilogCount
+                  << std::endl;
+
+        std::cout << "driverGetCount: " << driverGetCount
+                  << " driverGetPrologCount: " << driverGetPrologCount
+                  << " driverGetEpilogCount: " << driverGetEpilogCount
+                  << std::endl;
+
+        std::cout << "memAllocDeviceCount: " << memAllocDeviceCount
+                  << " memAllocDevicePrologCount: " << memAllocDevicePrologCount
+                  << " memAllocDeviceEpilogCount: " << memAllocDeviceEpilogCount
+                  << std::endl;
+
+        std::cout << "memAllocHostCount: " << memAllocHostCount
+                  << " memAllocHostPrologCount: " << memAllocHostPrologCount
+                  << " memAllocHostEpilogCount: " << memAllocHostEpilogCount
+                  << std::endl;
+
+        std::cout << "memAllocSharedCount: " << memAllocSharedCount
+                  << " memAllocSharedPrologCount: " << memAllocSharedPrologCount
+                  << " memAllocSharedEpilogCount: " << memAllocSharedEpilogCount
+                  << std::endl;
+    }
+
+    SUCCESS_OR_TERMINATE_BOOL((initCount == initPrologCount) &&
+                              (initCount == initEpilogCount));
+    SUCCESS_OR_TERMINATE_BOOL((driverGetCount == driverGetPrologCount) &&
+                              (driverGetCount == driverGetEpilogCount));
+    SUCCESS_OR_TERMINATE_BOOL((memAllocDeviceCount == memAllocDevicePrologCount) &&
+                              (memAllocDeviceCount == memAllocDeviceEpilogCount));
+    SUCCESS_OR_TERMINATE_BOOL((memAllocHostCount == memAllocHostPrologCount) &&
+                              (memAllocHostCount == memAllocHostEpilogCount));
+    SUCCESS_OR_TERMINATE_BOOL((memAllocSharedCount == memAllocSharedPrologCount) &&
+                              (memAllocSharedCount == memAllocSharedEpilogCount));
+
+    SUCCESS_OR_TERMINATE(contextDdiTable.pfnDestroy(context));
+
+    bool aubMode = isAubMode(argc, argv);
+    if (aubMode == false) {
+        std::cout << "\nZello Copy Tracing Results validation " << (outputValidationSuccessful ? "PASSED" : "FAILED")
+                  << std::endl;
+    }
+
+    int resultOnFailure = aubMode ? 0 : 1;
+    return outputValidationSuccessful ? 0 : resultOnFailure;
+}