feature: R&R support for further appendLaunchKernel variants 2/2

Support:
`zeCommandListAppendLaunchMultipleKernelsIndirect()`

Related-To: NEO-15374
Signed-off-by: Maciej Bielski <maciej.bielski@intel.com>
This commit is contained in:
Maciej Bielski
2025-08-06 16:16:45 +00:00
committed by Compute-Runtime-Automation
parent f85e4935e2
commit 81f4b885f1
6 changed files with 253 additions and 6 deletions

View File

@@ -209,6 +209,18 @@ __kernel void increment_by_one(__global uchar *dst, __global uchar *src) {
}
)OpenCLC";
const char *memcpyBytesAndAddConstTestKernelSrc = R"===(
kernel void memcpy_bytes(__global char *dst, const __global char *src) {
unsigned int gid = get_global_id(0);
dst[gid] = src[gid];
}
__kernel void add_constant(global int *values, int addval) {
const int gid = get_global_id(0);
values[gid] = values[gid] + addval;
}
)===";
const char *scratchKernelSrc = R"===(
typedef long16 TYPE;
__attribute__((reqd_work_group_size(32, 1, 1))) // force LWS to 32

View File

@@ -25,6 +25,7 @@ extern const char *memcpyBytesTestKernelSrc;
extern const char *memcpyBytesWithPrintfTestKernelSrc;
extern const char *openCLKernelsSource;
extern const char *memcpyBytesAndAddConstTestKernelSrc;
extern const char *scratchKernelSrc;
extern const char *scratchKernelBuildOptions;

View File

@@ -460,6 +460,175 @@ void testAppendLaunchKernel(ze_driver_handle_t driver,
SUCCESS_OR_TERMINATE(graphApi.executableGraphDestroy(physicalGraph));
}
void testAppendLaunchMultipleKernelsIndirect(ze_driver_handle_t driver,
ze_context_handle_t &context,
ze_device_handle_t &device,
bool &validRet) {
auto graphApi = loadGraphApi(driver);
if (false == graphApi.valid()) {
std::cerr << "Graph API not available" << std::endl;
validRet = false;
return;
}
ze_module_handle_t module;
createModuleFromSpirV(context, device, LevelZeroBlackBoxTests::memcpyBytesAndAddConstTestKernelSrc, module);
ze_kernel_handle_t kernelMemcpySrcToDst;
createKernelWithName(module, "memcpy_bytes", kernelMemcpySrcToDst);
ze_kernel_handle_t kernelAddConstant;
createKernelWithName(module, "add_constant", kernelAddConstant);
ze_event_pool_handle_t eventPool = nullptr;
createEventPool(context, device, eventPool);
ze_event_handle_t eventCopied = nullptr;
createEventHostCoherent(eventPool, eventCopied);
ze_command_list_handle_t cmdList;
createImmediateCmdlistWithMode(context, device, ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, cmdList);
// Buffers
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
void *incrementedBuffer = nullptr;
void *dstBuffer = nullptr;
ze_device_mem_alloc_desc_t devAllocDesc = {
.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
.pNext = nullptr,
.flags = 0,
.ordinal = 0,
};
SUCCESS_OR_TERMINATE(zeMemAllocDevice(context, &devAllocDesc, allocSize, allocSize, device, &srcBuffer));
SUCCESS_OR_TERMINATE(zeMemAllocDevice(context, &devAllocDesc, allocSize, allocSize, device, &dstBuffer));
SUCCESS_OR_TERMINATE(zeMemAllocDevice(context, &devAllocDesc, allocSize, allocSize, device, &incrementedBuffer));
constexpr uint32_t kernelsNum = 2U;
uint32_t *kernelsNumBuff = nullptr;
ze_group_count_t *dispatchTraits = nullptr;
ze_host_mem_alloc_desc_t hostAllocDesc{
.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
.pNext = nullptr,
.flags = 0U,
};
SUCCESS_OR_TERMINATE(zeMemAllocHost(context, &hostAllocDesc, sizeof(uint32_t), 4096, reinterpret_cast<void **>(&kernelsNumBuff)));
SUCCESS_OR_TERMINATE(zeMemAllocHost(context, &hostAllocDesc, sizeof(ze_group_count_t) * kernelsNum, 4096, reinterpret_cast<void **>(&dispatchTraits)));
// Kernel groups size
constexpr size_t bytesPerThread = sizeof(std::byte);
constexpr size_t numThreads = allocSize / bytesPerThread;
uint32_t groupSizeX = 32u;
uint32_t groupSizeY = 1u;
uint32_t groupSizeZ = 1u;
SUCCESS_OR_TERMINATE_BOOL(numThreads % groupSizeX == 0);
SUCCESS_OR_TERMINATE(zeKernelSuggestGroupSize(kernelAddConstant, static_cast<uint32_t>(numThreads), 1U, 1U, &groupSizeX, &groupSizeY, &groupSizeZ));
if (LevelZeroBlackBoxTests::verbose) {
std::cout << "Group size : (" << groupSizeX << ", " << groupSizeY << ", " << groupSizeZ << ")" << std::endl;
}
SUCCESS_OR_TERMINATE(zeKernelSetGroupSize(kernelMemcpySrcToDst, groupSizeX, groupSizeY, groupSizeZ));
SUCCESS_OR_TERMINATE(zeKernelSetGroupSize(kernelAddConstant, groupSizeX, groupSizeY, groupSizeZ));
// Start capturing commands
ze_graph_handle_t virtualGraph = nullptr;
SUCCESS_OR_TERMINATE(graphApi.graphCreate(context, &virtualGraph, nullptr));
SUCCESS_OR_TERMINATE(graphApi.commandListBeginCaptureIntoGraph(cmdList, virtualGraph, nullptr));
// Encode buffers initialization
constexpr std::byte srcInitialValue{0xA};
auto srcInitData = std::vector<std::byte>(allocSize, srcInitialValue);
constexpr std::byte dstInitialValue{0x5};
auto dstInitData = std::vector<std::byte>(allocSize, dstInitialValue);
constexpr int valueToIncrement{-3};
constexpr int deltaValue = 2;
auto incrementedData = std::vector<int>(allocSize / sizeof(int), valueToIncrement);
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, srcBuffer, srcInitData.data(), allocSize, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, dstBuffer, dstInitData.data(), allocSize, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, incrementedBuffer, incrementedData.data(), allocSize, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(cmdList, nullptr, 0, nullptr));
// Prepare contiguous dispatch traits
for (uint32_t i{0U}; i < kernelsNum; ++i) {
dispatchTraits[i] = {
.groupCountX = static_cast<uint32_t>(numThreads) / groupSizeX,
.groupCountY = 1u,
.groupCountZ = 1u,
};
}
LevelZeroBlackBoxTests::printGroupCount(dispatchTraits[0]);
SUCCESS_OR_TERMINATE_BOOL(dispatchTraits[0].groupCountX * groupSizeX == allocSize);
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelMemcpySrcToDst, 0, sizeof(dstBuffer), &dstBuffer));
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelMemcpySrcToDst, 1, sizeof(srcBuffer), &srcBuffer));
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelAddConstant, 0, sizeof(incrementedBuffer), &incrementedBuffer));
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelAddConstant, 1, sizeof(int), &deltaValue));
ze_kernel_handle_t pKernelHandles[] = {kernelMemcpySrcToDst, kernelAddConstant};
*kernelsNumBuff = kernelsNum;
SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchMultipleKernelsIndirect(cmdList,
kernelsNum,
pKernelHandles,
kernelsNumBuff,
dispatchTraits,
eventCopied,
0U,
nullptr));
// Encode reading data back
auto dstOut = std::vector<std::byte>(allocSize);
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, dstOut.data(), dstBuffer, allocSize, nullptr, 1, &eventCopied));
auto incrementedOut = std::vector<std::byte>(allocSize);
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, incrementedOut.data(), incrementedBuffer, allocSize, nullptr, 1, &eventCopied));
SUCCESS_OR_TERMINATE(graphApi.commandListEndGraphCapture(cmdList, nullptr, nullptr));
ze_executable_graph_handle_t physicalGraph = nullptr;
SUCCESS_OR_TERMINATE(graphApi.commandListInstantiateGraph(virtualGraph, &physicalGraph, nullptr));
// Dispatch and wait
SUCCESS_OR_TERMINATE(graphApi.commandListAppendGraph(cmdList, physicalGraph, nullptr, nullptr, 0, nullptr));
SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize(cmdList, -1));
// Validate
auto expectedDst = std::vector<std::byte>(allocSize, srcInitialValue);
validRet = LevelZeroBlackBoxTests::validate(dstOut.data(), expectedDst.data(), allocSize);
if (!validRet) {
std::cerr << "Data mismatches found!\n";
std::cerr << "copiedOutData == " << static_cast<void *>(dstOut.data()) << "\n";
std::cerr << "expectedData == " << static_cast<void *>(expectedDst.data()) << std::endl;
}
constexpr std::byte incrementedValue{0xFF};
auto expectedIncremented = std::vector<std::byte>(allocSize, incrementedValue);
validRet = LevelZeroBlackBoxTests::validate(incrementedOut.data(), expectedIncremented.data(), allocSize);
if (!validRet) {
std::cerr << "Data mismatches found!\n";
std::cerr << "incrementedData == " << static_cast<void *>(incrementedOut.data()) << "\n";
std::cerr << "expectedData == " << static_cast<void *>(expectedIncremented.data()) << std::endl;
}
// Cleanup
SUCCESS_OR_TERMINATE(zeMemFree(context, dispatchTraits));
SUCCESS_OR_TERMINATE(zeMemFree(context, kernelsNumBuff));
SUCCESS_OR_TERMINATE(zeMemFree(context, incrementedBuffer));
SUCCESS_OR_TERMINATE(zeMemFree(context, dstBuffer));
SUCCESS_OR_TERMINATE(zeMemFree(context, srcBuffer));
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdList));
SUCCESS_OR_TERMINATE(zeEventDestroy(eventCopied));
SUCCESS_OR_TERMINATE(zeEventPoolDestroy(eventPool));
SUCCESS_OR_TERMINATE(zeKernelDestroy(kernelAddConstant));
SUCCESS_OR_TERMINATE(zeKernelDestroy(kernelMemcpySrcToDst));
SUCCESS_OR_TERMINATE(zeModuleDestroy(module));
SUCCESS_OR_TERMINATE(graphApi.graphDestroy(virtualGraph));
SUCCESS_OR_TERMINATE(graphApi.executableGraphDestroy(physicalGraph));
}
int main(int argc, char *argv[]) {
const std::string blackBoxName("Zello Graph");
LevelZeroBlackBoxTests::verbose = LevelZeroBlackBoxTests::isVerbose(argc, argv);
@@ -494,6 +663,10 @@ int main(int argc, char *argv[]) {
testAppendLaunchKernel(driverHandle, context, device0, true, outputValidationSuccessful);
LevelZeroBlackBoxTests::printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
currentTest = "AppendLaunchMultipleKernelsIndirect";
testAppendLaunchMultipleKernelsIndirect(driverHandle, context, device0, outputValidationSuccessful);
LevelZeroBlackBoxTests::printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
SUCCESS_OR_TERMINATE(zeContextDestroy(context));
int resultOnFailure = aubMode ? 0 : 1;

View File

@@ -395,6 +395,7 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
Mock<CommandList> cmdlist;
Mock<Event> event;
Mock<KernelImp> kernel;
Mock<KernelImp> kernel2;
ze_image_handle_t imgA = nullptr;
ze_image_handle_t imgB = nullptr;
ze_device_handle_t device = nullptr;
@@ -403,7 +404,9 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
ze_event_handle_t eventHandle = &event;
ze_external_semaphore_signal_params_ext_t semSignalParams = {};
ze_external_semaphore_wait_params_ext_t semWaitParams = {};
uint32_t kernelCount = 1;
ze_kernel_handle_t pKernelHandles[] = {&kernel, &kernel2};
const uint32_t numKernels = 2U;
const auto *pCountBuffer = &numKernels;
uint64_t memA[16] = {};
uint64_t memB[16] = {};
@@ -447,14 +450,12 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchCooperativeKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernelIndirect(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
// temporarily unsupported
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, L0::zeCommandListAppendLaunchMultipleKernelsIndirect(&cmdlist, 1, &kernelHandle, &kernelCount, &groupCount, nullptr, 0, nullptr));
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchMultipleKernelsIndirect(&cmdlist, numKernels, pKernelHandles, pCountBuffer, &groupCount, nullptr, 0, nullptr));
ze_graph_handle_t hgraph = &graph;
EXPECT_EQ(ZE_RESULT_SUCCESS, ::zeCommandListEndGraphCaptureExp(&cmdlist, &hgraph, nullptr));
ASSERT_EQ(24U, graph.getCapturedCommands().size());
ASSERT_EQ(25U, graph.getCapturedCommands().size());
uint32_t i = 0;
EXPECT_EQ(CaptureApi::zeCommandListAppendBarrier, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
EXPECT_EQ(CaptureApi::zeCommandListAppendMemoryCopy, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
@@ -480,6 +481,7 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchKernel, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchCooperativeKernel, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchKernelIndirect, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchMultipleKernelsIndirect, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
}
TEST(GraphForks, GivenUnknownChildCommandlistThenJoinDoesNothing) {
@@ -723,12 +725,17 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
Mock<Module> module(this->device, nullptr);
Mock<KernelImp> kernel;
kernel.module = &module;
Mock<KernelImp> kernel2;
kernel2.module = &module;
ze_image_handle_t imgA = nullptr;
ze_image_handle_t imgB = nullptr;
zes_device_handle_t device = nullptr;
ze_external_semaphore_ext_handle_t sem = nullptr;
ze_event_handle_t eventHandle = &event;
zet_kernel_handle_t kernelHandle = &kernel;
ze_kernel_handle_t kernelHandle = &kernel;
ze_kernel_handle_t pKernelHandles[] = {&kernel, &kernel2};
const uint32_t numKernels = 2U;
const auto *pCountBuffer = &numKernels;
ze_external_semaphore_signal_params_ext_t semSignalParams = {};
ze_external_semaphore_wait_params_ext_t semWaitParams = {};
@@ -773,6 +780,7 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchCooperativeKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernelIndirect(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchMultipleKernelsIndirect(&cmdlist, numKernels, pKernelHandles, pCountBuffer, &groupCount, nullptr, 0, nullptr));
ze_graph_handle_t hgraph = &srcGraph;
EXPECT_EQ(ZE_RESULT_SUCCESS, ::zeCommandListEndGraphCaptureExp(&cmdlist, &hgraph, nullptr));
@@ -804,6 +812,7 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
EXPECT_EQ(0U, graphHwCommands->appendImageCopyFromMemoryExtCalled);
EXPECT_EQ(0U, graphHwCommands->appendLaunchKernelCalled);
EXPECT_EQ(0U, graphHwCommands->appendLaunchKernelIndirectCalled);
EXPECT_EQ(0U, graphHwCommands->appendLaunchMultipleKernelsIndirectCalled);
execGraph.instantiateFrom(srcGraph);
EXPECT_EQ(1U, graphHwCommands->appendBarrierCalled);
EXPECT_EQ(1U, graphHwCommands->appendMemoryCopyCalled);
@@ -828,6 +837,7 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
EXPECT_EQ(1U, graphHwCommands->appendImageCopyFromMemoryExtCalled);
EXPECT_EQ(2U, graphHwCommands->appendLaunchKernelCalled); // +1 for zeCommandListAppendLaunchCooperativeKernel
EXPECT_EQ(1U, graphHwCommands->appendLaunchKernelIndirectCalled);
EXPECT_EQ(1U, graphHwCommands->appendLaunchMultipleKernelsIndirectCalled);
}
TEST(GraphExecution, GivenEmptyExecutableGraphWhenSubmittingItToCommandListThenTakeCareOnlyOfEvents) {