mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-09 06:23:01 +08:00
feature: R&R support for further appendLaunchKernel variants 2/2
Support: `zeCommandListAppendLaunchMultipleKernelsIndirect()` Related-To: NEO-15374 Signed-off-by: Maciej Bielski <maciej.bielski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
f85e4935e2
commit
81f4b885f1
@@ -209,6 +209,18 @@ __kernel void increment_by_one(__global uchar *dst, __global uchar *src) {
|
||||
}
|
||||
)OpenCLC";
|
||||
|
||||
const char *memcpyBytesAndAddConstTestKernelSrc = R"===(
|
||||
kernel void memcpy_bytes(__global char *dst, const __global char *src) {
|
||||
unsigned int gid = get_global_id(0);
|
||||
dst[gid] = src[gid];
|
||||
}
|
||||
|
||||
__kernel void add_constant(global int *values, int addval) {
|
||||
const int gid = get_global_id(0);
|
||||
values[gid] = values[gid] + addval;
|
||||
}
|
||||
)===";
|
||||
|
||||
const char *scratchKernelSrc = R"===(
|
||||
typedef long16 TYPE;
|
||||
__attribute__((reqd_work_group_size(32, 1, 1))) // force LWS to 32
|
||||
|
||||
@@ -25,6 +25,7 @@ extern const char *memcpyBytesTestKernelSrc;
|
||||
extern const char *memcpyBytesWithPrintfTestKernelSrc;
|
||||
|
||||
extern const char *openCLKernelsSource;
|
||||
extern const char *memcpyBytesAndAddConstTestKernelSrc;
|
||||
|
||||
extern const char *scratchKernelSrc;
|
||||
extern const char *scratchKernelBuildOptions;
|
||||
|
||||
@@ -460,6 +460,175 @@ void testAppendLaunchKernel(ze_driver_handle_t driver,
|
||||
SUCCESS_OR_TERMINATE(graphApi.executableGraphDestroy(physicalGraph));
|
||||
}
|
||||
|
||||
void testAppendLaunchMultipleKernelsIndirect(ze_driver_handle_t driver,
|
||||
ze_context_handle_t &context,
|
||||
ze_device_handle_t &device,
|
||||
bool &validRet) {
|
||||
auto graphApi = loadGraphApi(driver);
|
||||
if (false == graphApi.valid()) {
|
||||
std::cerr << "Graph API not available" << std::endl;
|
||||
validRet = false;
|
||||
return;
|
||||
}
|
||||
|
||||
ze_module_handle_t module;
|
||||
createModuleFromSpirV(context, device, LevelZeroBlackBoxTests::memcpyBytesAndAddConstTestKernelSrc, module);
|
||||
|
||||
ze_kernel_handle_t kernelMemcpySrcToDst;
|
||||
createKernelWithName(module, "memcpy_bytes", kernelMemcpySrcToDst);
|
||||
ze_kernel_handle_t kernelAddConstant;
|
||||
createKernelWithName(module, "add_constant", kernelAddConstant);
|
||||
|
||||
ze_event_pool_handle_t eventPool = nullptr;
|
||||
createEventPool(context, device, eventPool);
|
||||
|
||||
ze_event_handle_t eventCopied = nullptr;
|
||||
createEventHostCoherent(eventPool, eventCopied);
|
||||
|
||||
ze_command_list_handle_t cmdList;
|
||||
createImmediateCmdlistWithMode(context, device, ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, cmdList);
|
||||
|
||||
// Buffers
|
||||
constexpr size_t allocSize = 4096;
|
||||
void *srcBuffer = nullptr;
|
||||
void *incrementedBuffer = nullptr;
|
||||
void *dstBuffer = nullptr;
|
||||
ze_device_mem_alloc_desc_t devAllocDesc = {
|
||||
.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
|
||||
.pNext = nullptr,
|
||||
.flags = 0,
|
||||
.ordinal = 0,
|
||||
};
|
||||
SUCCESS_OR_TERMINATE(zeMemAllocDevice(context, &devAllocDesc, allocSize, allocSize, device, &srcBuffer));
|
||||
SUCCESS_OR_TERMINATE(zeMemAllocDevice(context, &devAllocDesc, allocSize, allocSize, device, &dstBuffer));
|
||||
SUCCESS_OR_TERMINATE(zeMemAllocDevice(context, &devAllocDesc, allocSize, allocSize, device, &incrementedBuffer));
|
||||
|
||||
constexpr uint32_t kernelsNum = 2U;
|
||||
uint32_t *kernelsNumBuff = nullptr;
|
||||
ze_group_count_t *dispatchTraits = nullptr;
|
||||
ze_host_mem_alloc_desc_t hostAllocDesc{
|
||||
.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
|
||||
.pNext = nullptr,
|
||||
.flags = 0U,
|
||||
};
|
||||
SUCCESS_OR_TERMINATE(zeMemAllocHost(context, &hostAllocDesc, sizeof(uint32_t), 4096, reinterpret_cast<void **>(&kernelsNumBuff)));
|
||||
SUCCESS_OR_TERMINATE(zeMemAllocHost(context, &hostAllocDesc, sizeof(ze_group_count_t) * kernelsNum, 4096, reinterpret_cast<void **>(&dispatchTraits)));
|
||||
|
||||
// Kernel groups size
|
||||
constexpr size_t bytesPerThread = sizeof(std::byte);
|
||||
constexpr size_t numThreads = allocSize / bytesPerThread;
|
||||
uint32_t groupSizeX = 32u;
|
||||
uint32_t groupSizeY = 1u;
|
||||
uint32_t groupSizeZ = 1u;
|
||||
SUCCESS_OR_TERMINATE_BOOL(numThreads % groupSizeX == 0);
|
||||
SUCCESS_OR_TERMINATE(zeKernelSuggestGroupSize(kernelAddConstant, static_cast<uint32_t>(numThreads), 1U, 1U, &groupSizeX, &groupSizeY, &groupSizeZ));
|
||||
if (LevelZeroBlackBoxTests::verbose) {
|
||||
std::cout << "Group size : (" << groupSizeX << ", " << groupSizeY << ", " << groupSizeZ << ")" << std::endl;
|
||||
}
|
||||
SUCCESS_OR_TERMINATE(zeKernelSetGroupSize(kernelMemcpySrcToDst, groupSizeX, groupSizeY, groupSizeZ));
|
||||
SUCCESS_OR_TERMINATE(zeKernelSetGroupSize(kernelAddConstant, groupSizeX, groupSizeY, groupSizeZ));
|
||||
|
||||
// Start capturing commands
|
||||
ze_graph_handle_t virtualGraph = nullptr;
|
||||
SUCCESS_OR_TERMINATE(graphApi.graphCreate(context, &virtualGraph, nullptr));
|
||||
SUCCESS_OR_TERMINATE(graphApi.commandListBeginCaptureIntoGraph(cmdList, virtualGraph, nullptr));
|
||||
|
||||
// Encode buffers initialization
|
||||
constexpr std::byte srcInitialValue{0xA};
|
||||
auto srcInitData = std::vector<std::byte>(allocSize, srcInitialValue);
|
||||
constexpr std::byte dstInitialValue{0x5};
|
||||
auto dstInitData = std::vector<std::byte>(allocSize, dstInitialValue);
|
||||
constexpr int valueToIncrement{-3};
|
||||
constexpr int deltaValue = 2;
|
||||
auto incrementedData = std::vector<int>(allocSize / sizeof(int), valueToIncrement);
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, srcBuffer, srcInitData.data(), allocSize, nullptr, 0, nullptr));
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, dstBuffer, dstInitData.data(), allocSize, nullptr, 0, nullptr));
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, incrementedBuffer, incrementedData.data(), allocSize, nullptr, 0, nullptr));
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(cmdList, nullptr, 0, nullptr));
|
||||
|
||||
// Prepare contiguous dispatch traits
|
||||
for (uint32_t i{0U}; i < kernelsNum; ++i) {
|
||||
dispatchTraits[i] = {
|
||||
.groupCountX = static_cast<uint32_t>(numThreads) / groupSizeX,
|
||||
.groupCountY = 1u,
|
||||
.groupCountZ = 1u,
|
||||
};
|
||||
}
|
||||
|
||||
LevelZeroBlackBoxTests::printGroupCount(dispatchTraits[0]);
|
||||
SUCCESS_OR_TERMINATE_BOOL(dispatchTraits[0].groupCountX * groupSizeX == allocSize);
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelMemcpySrcToDst, 0, sizeof(dstBuffer), &dstBuffer));
|
||||
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelMemcpySrcToDst, 1, sizeof(srcBuffer), &srcBuffer));
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelAddConstant, 0, sizeof(incrementedBuffer), &incrementedBuffer));
|
||||
SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernelAddConstant, 1, sizeof(int), &deltaValue));
|
||||
|
||||
ze_kernel_handle_t pKernelHandles[] = {kernelMemcpySrcToDst, kernelAddConstant};
|
||||
*kernelsNumBuff = kernelsNum;
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchMultipleKernelsIndirect(cmdList,
|
||||
kernelsNum,
|
||||
pKernelHandles,
|
||||
kernelsNumBuff,
|
||||
dispatchTraits,
|
||||
eventCopied,
|
||||
0U,
|
||||
nullptr));
|
||||
|
||||
// Encode reading data back
|
||||
auto dstOut = std::vector<std::byte>(allocSize);
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, dstOut.data(), dstBuffer, allocSize, nullptr, 1, &eventCopied));
|
||||
|
||||
auto incrementedOut = std::vector<std::byte>(allocSize);
|
||||
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, incrementedOut.data(), incrementedBuffer, allocSize, nullptr, 1, &eventCopied));
|
||||
|
||||
SUCCESS_OR_TERMINATE(graphApi.commandListEndGraphCapture(cmdList, nullptr, nullptr));
|
||||
ze_executable_graph_handle_t physicalGraph = nullptr;
|
||||
SUCCESS_OR_TERMINATE(graphApi.commandListInstantiateGraph(virtualGraph, &physicalGraph, nullptr));
|
||||
|
||||
// Dispatch and wait
|
||||
SUCCESS_OR_TERMINATE(graphApi.commandListAppendGraph(cmdList, physicalGraph, nullptr, nullptr, 0, nullptr));
|
||||
SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize(cmdList, -1));
|
||||
|
||||
// Validate
|
||||
auto expectedDst = std::vector<std::byte>(allocSize, srcInitialValue);
|
||||
validRet = LevelZeroBlackBoxTests::validate(dstOut.data(), expectedDst.data(), allocSize);
|
||||
if (!validRet) {
|
||||
std::cerr << "Data mismatches found!\n";
|
||||
std::cerr << "copiedOutData == " << static_cast<void *>(dstOut.data()) << "\n";
|
||||
std::cerr << "expectedData == " << static_cast<void *>(expectedDst.data()) << std::endl;
|
||||
}
|
||||
|
||||
constexpr std::byte incrementedValue{0xFF};
|
||||
auto expectedIncremented = std::vector<std::byte>(allocSize, incrementedValue);
|
||||
validRet = LevelZeroBlackBoxTests::validate(incrementedOut.data(), expectedIncremented.data(), allocSize);
|
||||
if (!validRet) {
|
||||
std::cerr << "Data mismatches found!\n";
|
||||
std::cerr << "incrementedData == " << static_cast<void *>(incrementedOut.data()) << "\n";
|
||||
std::cerr << "expectedData == " << static_cast<void *>(expectedIncremented.data()) << std::endl;
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
SUCCESS_OR_TERMINATE(zeMemFree(context, dispatchTraits));
|
||||
SUCCESS_OR_TERMINATE(zeMemFree(context, kernelsNumBuff));
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeMemFree(context, incrementedBuffer));
|
||||
SUCCESS_OR_TERMINATE(zeMemFree(context, dstBuffer));
|
||||
SUCCESS_OR_TERMINATE(zeMemFree(context, srcBuffer));
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdList));
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeEventDestroy(eventCopied));
|
||||
SUCCESS_OR_TERMINATE(zeEventPoolDestroy(eventPool));
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeKernelDestroy(kernelAddConstant));
|
||||
SUCCESS_OR_TERMINATE(zeKernelDestroy(kernelMemcpySrcToDst));
|
||||
SUCCESS_OR_TERMINATE(zeModuleDestroy(module));
|
||||
|
||||
SUCCESS_OR_TERMINATE(graphApi.graphDestroy(virtualGraph));
|
||||
SUCCESS_OR_TERMINATE(graphApi.executableGraphDestroy(physicalGraph));
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
const std::string blackBoxName("Zello Graph");
|
||||
LevelZeroBlackBoxTests::verbose = LevelZeroBlackBoxTests::isVerbose(argc, argv);
|
||||
@@ -494,6 +663,10 @@ int main(int argc, char *argv[]) {
|
||||
testAppendLaunchKernel(driverHandle, context, device0, true, outputValidationSuccessful);
|
||||
LevelZeroBlackBoxTests::printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
|
||||
|
||||
currentTest = "AppendLaunchMultipleKernelsIndirect";
|
||||
testAppendLaunchMultipleKernelsIndirect(driverHandle, context, device0, outputValidationSuccessful);
|
||||
LevelZeroBlackBoxTests::printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
|
||||
|
||||
SUCCESS_OR_TERMINATE(zeContextDestroy(context));
|
||||
|
||||
int resultOnFailure = aubMode ? 0 : 1;
|
||||
|
||||
@@ -395,6 +395,7 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
|
||||
Mock<CommandList> cmdlist;
|
||||
Mock<Event> event;
|
||||
Mock<KernelImp> kernel;
|
||||
Mock<KernelImp> kernel2;
|
||||
ze_image_handle_t imgA = nullptr;
|
||||
ze_image_handle_t imgB = nullptr;
|
||||
ze_device_handle_t device = nullptr;
|
||||
@@ -403,7 +404,9 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
|
||||
ze_event_handle_t eventHandle = &event;
|
||||
ze_external_semaphore_signal_params_ext_t semSignalParams = {};
|
||||
ze_external_semaphore_wait_params_ext_t semWaitParams = {};
|
||||
uint32_t kernelCount = 1;
|
||||
ze_kernel_handle_t pKernelHandles[] = {&kernel, &kernel2};
|
||||
const uint32_t numKernels = 2U;
|
||||
const auto *pCountBuffer = &numKernels;
|
||||
|
||||
uint64_t memA[16] = {};
|
||||
uint64_t memB[16] = {};
|
||||
@@ -447,14 +450,12 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchCooperativeKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernelIndirect(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
|
||||
|
||||
// temporarily unsupported
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, L0::zeCommandListAppendLaunchMultipleKernelsIndirect(&cmdlist, 1, &kernelHandle, &kernelCount, &groupCount, nullptr, 0, nullptr));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchMultipleKernelsIndirect(&cmdlist, numKernels, pKernelHandles, pCountBuffer, &groupCount, nullptr, 0, nullptr));
|
||||
|
||||
ze_graph_handle_t hgraph = &graph;
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, ::zeCommandListEndGraphCaptureExp(&cmdlist, &hgraph, nullptr));
|
||||
|
||||
ASSERT_EQ(24U, graph.getCapturedCommands().size());
|
||||
ASSERT_EQ(25U, graph.getCapturedCommands().size());
|
||||
uint32_t i = 0;
|
||||
EXPECT_EQ(CaptureApi::zeCommandListAppendBarrier, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
|
||||
EXPECT_EQ(CaptureApi::zeCommandListAppendMemoryCopy, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
|
||||
@@ -480,6 +481,7 @@ TEST(GraphTestApiCapture, GivenCommandListInRecordStateThenCaptureCommandsInstea
|
||||
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchKernel, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
|
||||
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchCooperativeKernel, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
|
||||
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchKernelIndirect, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
|
||||
EXPECT_EQ(CaptureApi::zeCommandListAppendLaunchMultipleKernelsIndirect, static_cast<CaptureApi>(graph.getCapturedCommands()[i++].index()));
|
||||
}
|
||||
|
||||
TEST(GraphForks, GivenUnknownChildCommandlistThenJoinDoesNothing) {
|
||||
@@ -723,12 +725,17 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
|
||||
Mock<Module> module(this->device, nullptr);
|
||||
Mock<KernelImp> kernel;
|
||||
kernel.module = &module;
|
||||
Mock<KernelImp> kernel2;
|
||||
kernel2.module = &module;
|
||||
ze_image_handle_t imgA = nullptr;
|
||||
ze_image_handle_t imgB = nullptr;
|
||||
zes_device_handle_t device = nullptr;
|
||||
ze_external_semaphore_ext_handle_t sem = nullptr;
|
||||
ze_event_handle_t eventHandle = &event;
|
||||
zet_kernel_handle_t kernelHandle = &kernel;
|
||||
ze_kernel_handle_t kernelHandle = &kernel;
|
||||
ze_kernel_handle_t pKernelHandles[] = {&kernel, &kernel2};
|
||||
const uint32_t numKernels = 2U;
|
||||
const auto *pCountBuffer = &numKernels;
|
||||
ze_external_semaphore_signal_params_ext_t semSignalParams = {};
|
||||
ze_external_semaphore_wait_params_ext_t semWaitParams = {};
|
||||
|
||||
@@ -773,6 +780,7 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchCooperativeKernel(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchKernelIndirect(&cmdlist, kernelHandle, &groupCount, nullptr, 0, nullptr));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, L0::zeCommandListAppendLaunchMultipleKernelsIndirect(&cmdlist, numKernels, pKernelHandles, pCountBuffer, &groupCount, nullptr, 0, nullptr));
|
||||
|
||||
ze_graph_handle_t hgraph = &srcGraph;
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, ::zeCommandListEndGraphCaptureExp(&cmdlist, &hgraph, nullptr));
|
||||
@@ -804,6 +812,7 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
|
||||
EXPECT_EQ(0U, graphHwCommands->appendImageCopyFromMemoryExtCalled);
|
||||
EXPECT_EQ(0U, graphHwCommands->appendLaunchKernelCalled);
|
||||
EXPECT_EQ(0U, graphHwCommands->appendLaunchKernelIndirectCalled);
|
||||
EXPECT_EQ(0U, graphHwCommands->appendLaunchMultipleKernelsIndirectCalled);
|
||||
execGraph.instantiateFrom(srcGraph);
|
||||
EXPECT_EQ(1U, graphHwCommands->appendBarrierCalled);
|
||||
EXPECT_EQ(1U, graphHwCommands->appendMemoryCopyCalled);
|
||||
@@ -828,6 +837,7 @@ TEST_F(GraphTestInstantiationFixture, WhenInstantiatingGraphThenBakeCommandsInto
|
||||
EXPECT_EQ(1U, graphHwCommands->appendImageCopyFromMemoryExtCalled);
|
||||
EXPECT_EQ(2U, graphHwCommands->appendLaunchKernelCalled); // +1 for zeCommandListAppendLaunchCooperativeKernel
|
||||
EXPECT_EQ(1U, graphHwCommands->appendLaunchKernelIndirectCalled);
|
||||
EXPECT_EQ(1U, graphHwCommands->appendLaunchMultipleKernelsIndirectCalled);
|
||||
}
|
||||
|
||||
TEST(GraphExecution, GivenEmptyExecutableGraphWhenSubmittingItToCommandListThenTakeCareOnlyOfEvents) {
|
||||
|
||||
Reference in New Issue
Block a user