fix: correct global device synchronization

skip uninitialized command stream receivers

Additionally, add L0 black box for kernel with arg slm

Test verifies new simplified L0 API

Functions:
- zerDriverGetDefaultContext
- zeDeviceSynchronize
- zeCommandListAppendLaunchKernelWithArguments
- zerIdentifierTranslateToDeviceHandle
- zerDeviceTranslateToIdentifier
- zerDriverGetLastErrorDescription

Definitions:
- defaultCommandQueueDesc
- defaultHostMemDesc
- defaultDeviceMemDesc

Related-To: NEO-14560
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2025-07-21 19:51:50 +00:00
committed by Compute-Runtime-Automation
parent 36f10319c4
commit df7e114d54
9 changed files with 274 additions and 18 deletions

View File

@@ -2247,23 +2247,27 @@ uint32_t DeviceImp::getEventMaxKernelCount() const {
ze_result_t DeviceImp::synchronize() {
for (auto &engine : neoDevice->getAllEngines()) {
auto waitStatus = engine.commandStreamReceiver->waitForTaskCountWithKmdNotifyFallback(
engine.commandStreamReceiver->peekTaskCount(),
engine.commandStreamReceiver->obtainCurrentFlushStamp(),
false,
NEO::QueueThrottle::MEDIUM);
if (waitStatus == NEO::WaitStatus::gpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
if (engine.commandStreamReceiver->isInitialized()) {
auto waitStatus = engine.commandStreamReceiver->waitForTaskCountWithKmdNotifyFallback(
engine.commandStreamReceiver->peekTaskCount(),
engine.commandStreamReceiver->obtainCurrentFlushStamp(),
false,
NEO::QueueThrottle::MEDIUM);
if (waitStatus == NEO::WaitStatus::gpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
}
for (auto &secondaryCsr : neoDevice->getSecondaryCsrs()) {
auto waitStatus = secondaryCsr->waitForTaskCountWithKmdNotifyFallback(
secondaryCsr->peekTaskCount(),
secondaryCsr->obtainCurrentFlushStamp(),
false,
NEO::QueueThrottle::MEDIUM);
if (waitStatus == NEO::WaitStatus::gpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
if (secondaryCsr->isInitialized()) {
auto waitStatus = secondaryCsr->waitForTaskCountWithKmdNotifyFallback(
secondaryCsr->peekTaskCount(),
secondaryCsr->obtainCurrentFlushStamp(),
false,
NEO::QueueThrottle::MEDIUM);
if (waitStatus == NEO::WaitStatus::gpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
}

View File

@@ -38,6 +38,7 @@ target_include_directories(${L0_BLACK_BOX_TEST_SHARED_LIB} PUBLIC
set_target_properties(${L0_BLACK_BOX_TEST_SHARED_LIB} PROPERTIES FOLDER ${L0_BLACK_BOX_TEST_PROJECT_FOLDER})
set(TEST_TARGETS
zello_arg_slm
zello_atomic_inc
zello_bindless_kernel
zello_commandlist_immediate

View File

@@ -20,6 +20,11 @@
namespace LevelZeroBlackBoxTests {
decltype(&zerDriverGetDefaultContext) zerDriverGetDefaultContextFunc = nullptr;
decltype(&zeDeviceSynchronize) zeDeviceSynchronizeFunc = nullptr;
decltype(&zeCommandListAppendLaunchKernelWithArguments) zeCommandListAppendLaunchKernelWithArgumentsFunc = nullptr;
decltype(&zerIdentifierTranslateToDeviceHandle) zerIdentifierTranslateToDeviceHandleFunc = nullptr;
decltype(&zerDeviceTranslateToIdentifier) zerDeviceTranslateToIdentifierFunc = nullptr;
decltype(&zerDriverGetLastErrorDescription) zerDriverGetLastErrorDescriptionFunc = nullptr;
struct LoadedDriverExtensions {
std::vector<ze_driver_extension_properties_t> extensions;
@@ -261,7 +266,7 @@ void getErrorMax(int argc, char *argv[]) {
overrideErrorMax = getParamValue(argc, argv, "-em", "--errorMax", 0);
}
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string &blackBoxName, const std::string &currentTest) {
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string_view blackBoxName, const std::string_view currentTest) {
std::cout << std::endl
<< blackBoxName;
if (!currentTest.empty()) {
@@ -280,7 +285,7 @@ void printResult(bool aubMode, bool outputValidationSuccessful, const std::strin
}
}
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string &blackBoxName) {
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string_view blackBoxName) {
std::string currentTest{};
printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
}
@@ -454,6 +459,11 @@ std::vector<ze_device_handle_t> zelloInitContextAndGetDevices(ze_context_handle_
SUCCESS_OR_TERMINATE(zeDriverGet(&driverCount, &driverHandle));
SUCCESS_OR_TERMINATE(zeDriverGetExtensionFunctionAddress(driverHandle, "zerDriverGetDefaultContext", reinterpret_cast<void **>(&zerDriverGetDefaultContextFunc)));
SUCCESS_OR_TERMINATE(zeDriverGetExtensionFunctionAddress(driverHandle, "zeDeviceSynchronize", reinterpret_cast<void **>(&zeDeviceSynchronizeFunc)));
SUCCESS_OR_TERMINATE(zeDriverGetExtensionFunctionAddress(driverHandle, "zeCommandListAppendLaunchKernelWithArguments", reinterpret_cast<void **>(&zeCommandListAppendLaunchKernelWithArgumentsFunc)));
SUCCESS_OR_TERMINATE(zeDriverGetExtensionFunctionAddress(driverHandle, "zerIdentifierTranslateToDeviceHandle", reinterpret_cast<void **>(&zerIdentifierTranslateToDeviceHandleFunc)));
SUCCESS_OR_TERMINATE(zeDriverGetExtensionFunctionAddress(driverHandle, "zerDeviceTranslateToIdentifier", reinterpret_cast<void **>(&zerDeviceTranslateToIdentifierFunc)));
SUCCESS_OR_TERMINATE(zeDriverGetExtensionFunctionAddress(driverHandle, "zerDriverGetLastErrorDescription", reinterpret_cast<void **>(&zerDriverGetLastErrorDescriptionFunc)));
context = zerDriverGetDefaultContextFunc();
if (!context) {

View File

@@ -23,6 +23,12 @@ namespace LevelZeroBlackBoxTests {
template <bool terminateOnFailure, typename ResulT>
inline void validate(ResulT result, const char *message);
extern decltype(&zerDriverGetDefaultContext) zerDriverGetDefaultContextFunc;
extern decltype(&zeDeviceSynchronize) zeDeviceSynchronizeFunc;
extern decltype(&zeCommandListAppendLaunchKernelWithArguments) zeCommandListAppendLaunchKernelWithArgumentsFunc;
extern decltype(&zerIdentifierTranslateToDeviceHandle) zerIdentifierTranslateToDeviceHandleFunc;
extern decltype(&zerDeviceTranslateToIdentifier) zerDeviceTranslateToIdentifierFunc;
extern decltype(&zerDriverGetLastErrorDescription) zerDriverGetLastErrorDescriptionFunc;
} // namespace LevelZeroBlackBoxTests
#define SUCCESS_OR_TERMINATE(CALL) LevelZeroBlackBoxTests::validate<true>(CALL, #CALL)
@@ -88,9 +94,9 @@ uint32_t getBufferLength(int argc, char *argv[], uint32_t defaultLength);
void getErrorMax(int argc, char *argv[]);
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string &blackBoxName, const std::string &currentTest);
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string_view blackBoxName, const std::string_view currentTest);
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string &blackBoxName);
void printResult(bool aubMode, bool outputValidationSuccessful, const std::string_view blackBoxName);
uint32_t getCommandQueueOrdinal(ze_device_handle_t &device, bool useCooperativeFlag);

View File

@@ -146,6 +146,33 @@ std::vector<uint8_t> compileToNative(const std::string &src, const std::string &
oclocFreeOutput(&numOutputs, &outputs, &ouputLengths, &outputNames);
return ret;
}
const char *slmArgKernelSrc = R"===(
__kernel void test_arg_slm(
__global unsigned int *outputSums, // Output array for sums (global memory)
__local unsigned int *localIdArray, // Local array for local IDs (shared memory)
__local unsigned int *globalIdArray // Local array for global IDs (shared memory)
) {
// Each work-item stores its local and global ID in local memory
localIdArray[get_local_id(0)] = get_local_id(0);
globalIdArray[get_local_id(0)] = get_global_id(0);
// Synchronize all work-items in the group
barrier(CLK_LOCAL_MEM_FENCE);
// Only the first work-item in the group performs the reduction
if(get_local_id(0) == 0){
unsigned int sumLocalIds = 0;
unsigned int sumGlobalIds = 0;
for(int i = 0; i < get_local_size(0); ++i){
sumLocalIds += localIdArray[i];
sumGlobalIds += globalIdArray[i];
}
// Store the results in the output array
outputSums[get_group_id(0)*2] = sumLocalIds;
outputSums[get_group_id(0)*2+1] = sumGlobalIds;
}
}
)===";
const char *memcpyBytesTestKernelSrc = R"===(
kernel void memcpy_bytes(__global char *dst, const __global char *src) {

View File

@@ -19,6 +19,7 @@ std::vector<uint8_t> compileToSpirV(const std::string &src, const std::string &o
std::vector<uint8_t> compileToSpirV(const std::string &src, const std::string &options, const std::string &device, std::string &outCompilerLog);
std::vector<uint8_t> compileToNative(const std::string &src, const std::string &deviceName, const std::string &revisionId, const std::string &options, const std::string &internalOptions, const std::string &statefulMode, std::string &outCompilerLog);
extern const char *slmArgKernelSrc;
extern const char *memcpyBytesTestKernelSrc;
extern const char *memcpyBytesWithPrintfTestKernelSrc;

View File

@@ -46,6 +46,11 @@ _default_config:
params:
- --verbose
zello_arg_slm:
bmg:
dg2:
pvc.b0:
zello_atomic_inc:
dg2:
pvc.b0:

View File

@@ -0,0 +1,155 @@
/*
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "zello_common.h"
#include "zello_compile.h"
#include <cstring>
constexpr std::string_view blackBoxName = "Zello Arg Slm";
void executeKernelAndValidate(ze_context_handle_t context, uint32_t deviceIdentfier,
bool &outputValidationSuccessful) {
if (LevelZeroBlackBoxTests::verbose) {
std::cout << "Testing for device " << deviceIdentfier << std::endl;
}
ze_command_list_handle_t cmdList;
auto device = LevelZeroBlackBoxTests::zerIdentifierTranslateToDeviceHandleFunc(deviceIdentfier);
SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device, &defaultCommandQueueDesc, &cmdList));
constexpr ze_group_count_t groupCounts{16, 1, 1};
// Create output buffer
void *dstBuffer = nullptr;
constexpr size_t allocSize = groupCounts.groupCountX * sizeof(uint32_t) * 2;
SUCCESS_OR_TERMINATE(zeMemAllocShared(context, &defaultDeviceMemDesc, &defaultHostMemDesc, allocSize, sizeof(uint32_t), device, &dstBuffer));
std::string buildLog;
auto spirV = LevelZeroBlackBoxTests::compileToSpirV(LevelZeroBlackBoxTests::slmArgKernelSrc, "", buildLog);
LevelZeroBlackBoxTests::printBuildLog(buildLog);
SUCCESS_OR_TERMINATE((0 == spirV.size()));
ze_module_handle_t module = nullptr;
ze_kernel_handle_t kernel = nullptr;
ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC};
ze_module_build_log_handle_t buildlog;
moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
moduleDesc.pInputModule = spirV.data();
moduleDesc.inputSize = spirV.size();
moduleDesc.pBuildFlags = "";
if (zeModuleCreate(context, device, &moduleDesc, &module, &buildlog) != ZE_RESULT_SUCCESS) {
size_t szLog = 0;
zeModuleBuildLogGetString(buildlog, &szLog, nullptr);
std::vector<char> strLog(szLog + 1, 0);
zeModuleBuildLogGetString(buildlog, &szLog, strLog.data());
LevelZeroBlackBoxTests::printBuildLog(strLog.data());
SUCCESS_OR_TERMINATE(zeModuleBuildLogDestroy(buildlog));
std::cerr << std::endl
<< blackBoxName << " Results validation FAILED. Module creation error."
<< std::endl;
SUCCESS_OR_TERMINATE_BOOL(false);
}
SUCCESS_OR_TERMINATE(zeModuleBuildLogDestroy(buildlog));
ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC};
kernelDesc.pKernelName = "test_arg_slm";
SUCCESS_OR_TERMINATE(zeKernelCreate(module, &kernelDesc, &kernel));
ze_kernel_properties_t kernProps = {ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES};
SUCCESS_OR_TERMINATE(zeKernelGetProperties(kernel, &kernProps));
LevelZeroBlackBoxTests::printKernelProperties(kernProps, kernelDesc.pKernelName);
for (auto groupSize : {64u, 128u, 256u}) {
// Initialize memory
constexpr uint8_t initValue = 77;
zeCommandListAppendMemoryFill(cmdList, dstBuffer, &initValue, sizeof(initValue), allocSize, nullptr, 0, nullptr);
ze_group_size_t groupSizes{groupSize, 1, 1};
size_t localWorkSizeForUint = groupSizes.groupSizeX * 4u;
const void *kernelArgs[] = {
&dstBuffer, // output buffer
&localWorkSizeForUint, // local buffer for local ids
&localWorkSizeForUint // local buffer for global ids
};
SUCCESS_OR_TERMINATE(LevelZeroBlackBoxTests::zeCommandListAppendLaunchKernelWithArgumentsFunc(cmdList, kernel, groupCounts, groupSizes, kernelArgs, nullptr, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(LevelZeroBlackBoxTests::zeDeviceSynchronizeFunc(device));
// Validate
outputValidationSuccessful = true;
std::vector<uint32_t> expectedOutput(groupCounts.groupCountX * 2, 0);
for (auto i = 0; i < static_cast<int>(groupCounts.groupCountX); ++i) {
auto sumOfLocalIds = groupSize * (groupSize - 1) / 2; // Sum of local IDs from 0 to localWorkSize-1
auto maxGlobalId = groupSize * (i + 1) - 1; // max global id for this group
auto minGlobalId = groupSize * i; // min global id for this group
auto sumOfGlobalIdWithinGroup = (maxGlobalId * (maxGlobalId + 1) / 2) - (minGlobalId * (minGlobalId - 1) / 2); // sum of global ids within this group
expectedOutput[i * 2] = sumOfLocalIds;
expectedOutput[i * 2 + 1] = sumOfGlobalIdWithinGroup;
}
for (auto i = 0; i < static_cast<int>(expectedOutput.size()); ++i) {
auto expectedValue = expectedOutput[i];
auto actualValue = reinterpret_cast<uint32_t *>(dstBuffer)[i];
if (actualValue != expectedValue) {
std::cout << "dstBuffer[" << i << "] = "
<< std::dec << actualValue << " not equal to "
<< expectedValue << "\n";
outputValidationSuccessful = false;
break;
}
}
if (!outputValidationSuccessful) {
break;
}
}
// Cleanup
SUCCESS_OR_TERMINATE(zeMemFree(context, dstBuffer));
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdList));
SUCCESS_OR_TERMINATE(zeKernelDestroy(kernel));
SUCCESS_OR_TERMINATE(zeModuleDestroy(module));
}
int main(int argc, char *argv[]) {
LevelZeroBlackBoxTests::verbose = LevelZeroBlackBoxTests::isVerbose(argc, argv);
bool aubMode = LevelZeroBlackBoxTests::isAubMode(argc, argv);
ze_context_handle_t context = nullptr;
const char *errorMsg = nullptr;
auto devices = LevelZeroBlackBoxTests::zelloInitContextAndGetDevices(context);
SUCCESS_OR_TERMINATE(LevelZeroBlackBoxTests::zerDriverGetLastErrorDescriptionFunc(&errorMsg));
if (errorMsg != nullptr && errorMsg[0] != 0) {
std::cerr << "Error initializing context: " << (errorMsg ? errorMsg : "Unknown error") << std::endl;
return 1;
}
uint32_t deviceOrdinal = LevelZeroBlackBoxTests::zerDeviceTranslateToIdentifierFunc(devices[0]);
SUCCESS_OR_TERMINATE(LevelZeroBlackBoxTests::zerDriverGetLastErrorDescriptionFunc(&errorMsg));
if (errorMsg != nullptr && errorMsg[0] != 0) {
std::cerr << "Error zerDeviceTranslateToIdentifier: " << errorMsg << std::endl;
return 1;
}
bool outputValidationSuccessful = false;
executeKernelAndValidate(context, deviceOrdinal, outputValidationSuccessful);
LevelZeroBlackBoxTests::printResult(aubMode, outputValidationSuccessful, blackBoxName);
outputValidationSuccessful = aubMode ? true : outputValidationSuccessful;
return (outputValidationSuccessful ? 0 : 1);
}

View File

@@ -6521,6 +6521,7 @@ TEST_F(DeviceSimpleTests, whenWorkgroupSizeCheckedThenSizeLimitIs1kOrLess) {
HWTEST_F(DeviceSimpleTests, givenGpuHangWhenSynchronizingDeviceThenErrorIsPropagated) {
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
csr.resourcesInitialized = true;
csr.waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::gpuHang;
auto result = zeDeviceSynchronize(device);
@@ -6540,6 +6541,7 @@ HWTEST_F(DeviceSimpleTests, givenNoGpuHangWhenSynchronizingDeviceThenCallWaitFor
csr->flushStamp->setStamp(flushStampToWait++);
csr->captureWaitForTaskCountWithKmdNotifyInputParams = true;
csr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::ready;
csr->resourcesInitialized = true;
}
auto &secondaryCsrs = neoDevice->getSecondaryCsrs();
@@ -6551,6 +6553,7 @@ HWTEST_F(DeviceSimpleTests, givenNoGpuHangWhenSynchronizingDeviceThenCallWaitFor
csr->flushStamp->setStamp(flushStampToWait++);
csr->captureWaitForTaskCountWithKmdNotifyInputParams = true;
csr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::ready;
csr->resourcesInitialized = true;
}
auto result = zeDeviceSynchronize(device);
@@ -6575,6 +6578,48 @@ HWTEST_F(DeviceSimpleTests, givenNoGpuHangWhenSynchronizingDeviceThenCallWaitFor
}
}
HWTEST_F(DeviceSimpleTests, whenSynchronizingDeviceThenIgnoreUninitializedCsrs) {
auto &engines = neoDevice->getAllEngines();
TaskCountType taskCountToWait = 1u;
FlushStamp flushStampToWait = 4u;
for (auto &engine : engines) {
auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(engine.commandStreamReceiver);
csr->latestSentTaskCount = 0u;
csr->latestFlushedTaskCount = 0u;
csr->taskCount = taskCountToWait++;
csr->flushStamp->setStamp(flushStampToWait++);
csr->captureWaitForTaskCountWithKmdNotifyInputParams = true;
csr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::gpuHang;
csr->resourcesInitialized = false;
}
auto &secondaryCsrs = neoDevice->getSecondaryCsrs();
for (auto &secondaryCsr : secondaryCsrs) {
auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(secondaryCsr.get());
csr->latestSentTaskCount = 0u;
csr->latestFlushedTaskCount = 0u;
csr->taskCount = taskCountToWait++;
csr->flushStamp->setStamp(flushStampToWait++);
csr->captureWaitForTaskCountWithKmdNotifyInputParams = true;
csr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::gpuHang;
csr->resourcesInitialized = false;
}
auto result = zeDeviceSynchronize(device);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
for (auto &engine : engines) {
auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(engine.commandStreamReceiver);
EXPECT_EQ(0u, csr->waitForTaskCountWithKmdNotifyInputParams.size());
}
for (auto &secondaryCsr : secondaryCsrs) {
auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(secondaryCsr.get());
EXPECT_EQ(0u, csr->waitForTaskCountWithKmdNotifyInputParams.size());
}
}
HWTEST_F(DeviceSimpleTests, givenGpuHangOnSecondaryCsrWhenSynchronizingDeviceThenErrorIsPropagated) {
if (neoDevice->getSecondaryCsrs().empty()) {
GTEST_SKIP();
@@ -6583,12 +6628,14 @@ HWTEST_F(DeviceSimpleTests, givenGpuHangOnSecondaryCsrWhenSynchronizingDeviceThe
for (auto &engine : engines) {
auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(engine.commandStreamReceiver);
csr->resourcesInitialized = true;
csr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::ready;
}
auto &secondaryCsrs = neoDevice->getSecondaryCsrs();
for (auto &secondaryCsr : secondaryCsrs) {
auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(secondaryCsr.get());
csr->resourcesInitialized = true;
csr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::gpuHang;
}