Fixing zeKernelGetProperties

Additionally, adding zell_world_jitc_ocloc BB test

Change-Id: If3b8c7e4bd4a789ec1b4f4de55ab21e91c1e7afa
This commit is contained in:
Jaroslaw Chodor 2020-11-02 10:39:17 +01:00 committed by sys_ocldev
parent 75d1ebb811
commit d181a2e5a3
4 changed files with 295 additions and 81 deletions

View File

@ -574,44 +574,23 @@ ze_result_t KernelImp::getKernelName(size_t *pSize, char *pName) {
}
ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) {
pKernelProperties->requiredGroupSizeX = this->groupSize[0];
pKernelProperties->requiredGroupSizeY = this->groupSize[1];
pKernelProperties->requiredGroupSizeZ = this->groupSize[2];
pKernelProperties->numKernelArgs =
static_cast<uint32_t>(this->kernelImmData->getDescriptor().payloadMappings.explicitArgs.size());
ModuleImp *moduleImp = reinterpret_cast<ModuleImp *>(this->module);
NEO::KernelInfo *ki = nullptr;
for (uint32_t i = 0; i < moduleImp->getTranslationUnit()->programInfo.kernelInfos.size(); i++) {
ki = moduleImp->getTranslationUnit()->programInfo.kernelInfos[i];
if (ki->kernelDescriptor.kernelMetadata.kernelName.compare(0, ki->kernelDescriptor.kernelMetadata.kernelName.size(), this->kernelImmData->getDescriptor().kernelMetadata.kernelName) == 0) {
break;
}
}
if (nullptr == ki) {
return ZE_RESULT_ERROR_UNINITIALIZED;
}
pKernelProperties->requiredNumSubGroups = static_cast<uint32_t>(ki->patchInfo.executionEnvironment->CompiledSubGroupsNumber);
pKernelProperties->requiredSubgroupSize = static_cast<uint32_t>(ki->requiredSubGroupSize);
pKernelProperties->maxSubgroupSize = ki->getMaxSimdSize();
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getDevice()->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
uint32_t maxRequiredWorkGroupSize = static_cast<uint32_t>(ki->getMaxRequiredWorkGroupSize(maxKernelWorkGroupSize));
uint32_t largestCompiledSIMDSize = static_cast<uint32_t>(ki->patchInfo.executionEnvironment->LargestCompiledSIMDSize);
pKernelProperties->maxNumSubgroups = static_cast<uint32_t>(Math::divideAndRoundUp(maxRequiredWorkGroupSize, largestCompiledSIMDSize));
pKernelProperties->localMemSize = static_cast<uint32_t>(moduleImp->getDevice()->getNEODevice()->getDeviceInfo().localMemSize);
pKernelProperties->privateMemSize = ki->patchInfo.pAllocateStatelessPrivateSurface ? ki->patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize
: 0;
pKernelProperties->spillMemSize = ki->patchInfo.mediavfestate ? ki->patchInfo.mediavfestate->PerThreadScratchSpace
: 0;
const auto &kernelDescriptor = this->kernelImmData->getDescriptor();
pKernelProperties->numKernelArgs = static_cast<uint32_t>(kernelDescriptor.payloadMappings.explicitArgs.size());
pKernelProperties->requiredGroupSizeX = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
pKernelProperties->requiredGroupSizeY = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
pKernelProperties->requiredGroupSizeZ = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
pKernelProperties->requiredNumSubGroups = kernelDescriptor.kernelMetadata.compiledSubGroupsNumber;
pKernelProperties->requiredSubgroupSize = kernelDescriptor.kernelMetadata.requiredSubGroupSize;
pKernelProperties->maxSubgroupSize = kernelDescriptor.kernelAttributes.simdSize;
pKernelProperties->localMemSize = kernelDescriptor.kernelAttributes.slmInlineSize;
pKernelProperties->privateMemSize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
pKernelProperties->spillMemSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
memset(pKernelProperties->uuid.kid, 0, ZE_MAX_KERNEL_UUID_SIZE);
memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getDevice()->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;
return ZE_RESULT_SUCCESS;
}

View File

@ -7,8 +7,9 @@
if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
set(L0_BLACK_BOX_TEST_PROJECT_FOLDER "ze_intel_gpu/black_box_tests")
set(TEST_TARGETS
zello_world_gpu
zello_timestamp
zello_world_gpu
zello_world_jitc_ocloc
)
foreach(TEST_NAME ${TEST_TARGETS})
@ -18,7 +19,7 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
PROPERTIES
VS_DEBUGGER_COMMAND "$(TargetPath)"
VS_DEBUGGER_COMMAND_ARGUMENTS ""
VS_DEBUGGER_WORKING_DIRECTORY "$(OutputPath)"
VS_DEBUGGER_WORKING_DIRECTORY "${TargetDir}"
)
add_dependencies(${TEST_NAME} ${TARGET_NAME_L0})
@ -26,4 +27,6 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
set_target_properties(${TEST_NAME} PROPERTIES FOLDER ${L0_BLACK_BOX_TEST_PROJECT_FOLDER})
endforeach()
target_link_libraries(zello_world_jitc_ocloc PUBLIC ocloc_lib)
endif()

View File

@ -0,0 +1,267 @@
/*
* Copyright (C) 2020 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/offline_compiler/source/ocloc_api.h"
#include <level_zero/ze_api.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <vector>
#define VALIDATECALL(myZeCall) \
do { \
if (myZeCall != ZE_RESULT_SUCCESS) { \
std::cout << "Error at " \
<< #myZeCall << ": " \
<< __FUNCTION__ << ": " \
<< __LINE__ << "\n"; \
std::terminate(); \
} \
} while (0);
const char *module = R"===(
__kernel void kernel_copy(__global int *dst, __global char *src){
uint gid = get_global_id(0);
dst[gid] = src[gid];
}
)===";
std::vector<uint8_t> compileToSpirV(const std::string &src, const std::string &options, std::string &outCompilerLog) {
std::vector<uint8_t> ret;
const char *mainFileName = "main.cl";
const char *argv[] = {"ocloc", "-q", "-device", "skl", "-file", mainFileName};
const unsigned char *sources[] = {reinterpret_cast<const unsigned char *>(src.c_str())};
size_t sourcesLengths[] = {src.size() + 1};
const char *sourcesNames[] = {mainFileName};
unsigned int numOutputs = 0U;
unsigned char **outputs = nullptr;
size_t *ouputLengths = nullptr;
char **outputNames = nullptr;
int result = oclocInvoke(sizeof(argv) / sizeof(argv[0]), argv,
1, sources, sourcesLengths, sourcesNames,
0, nullptr, nullptr, nullptr,
&numOutputs, &outputs, &ouputLengths, &outputNames);
unsigned char *spirV = nullptr;
size_t spirVlen = 0;
const char *log = nullptr;
size_t logLen = 0;
for (unsigned int i = 0; i < numOutputs; ++i) {
std::string spvExtension = ".spv";
std::string logFileName = "stdout.log";
auto nameLen = strlen(outputNames[i]);
if ((nameLen > spvExtension.size()) && (strstr(&outputNames[i][nameLen - spvExtension.size()], spvExtension.c_str()) != nullptr)) {
spirV = outputs[i];
spirVlen = ouputLengths[i];
} else if ((nameLen >= logFileName.size()) && (strstr(outputNames[i], logFileName.c_str()) != nullptr)) {
log = reinterpret_cast<const char *>(outputs[i]);
logLen = ouputLengths[i];
break;
}
}
if ((result != 0) && (logLen == 0)) {
outCompilerLog = "Unknown error, ocloc returnerd : " + std::to_string(result) + "\n";
return ret;
}
if (logLen != 0) {
outCompilerLog = std::string(log, logLen).c_str();
}
ret.assign(spirV, spirV + spirVlen);
oclocFreeOutput(&numOutputs, &outputs, &ouputLengths, &outputNames);
return ret;
}
int main(int argc, char *argv[]) {
// Initialize driver
VALIDATECALL(zeInit(ZE_INIT_FLAG_GPU_ONLY));
// Retrieve driver
uint32_t driverCount = 0;
VALIDATECALL(zeDriverGet(&driverCount, nullptr));
ze_driver_handle_t driverHandle;
VALIDATECALL(zeDriverGet(&driverCount, &driverHandle));
ze_context_desc_t contextDesc = {};
ze_context_handle_t context;
VALIDATECALL(zeContextCreate(driverHandle, &contextDesc, &context));
// Retrieve device
uint32_t deviceCount = 0;
VALIDATECALL(zeDeviceGet(driverHandle, &deviceCount, nullptr));
ze_device_handle_t device;
deviceCount = 1;
VALIDATECALL(zeDeviceGet(driverHandle, &deviceCount, &device));
// Print some properties
ze_device_properties_t deviceProperties = {};
VALIDATECALL(zeDeviceGetProperties(device, &deviceProperties));
std::cout << "Device : \n"
<< " * name : " << deviceProperties.name << "\n"
<< " * type : " << ((deviceProperties.type == ZE_DEVICE_TYPE_GPU) ? "GPU" : "FPGA") << "\n"
<< " * vendorId : " << std::hex << deviceProperties.vendorId << std::dec << "\n";
// Create command queue
uint32_t numQueueGroups = 0;
VALIDATECALL(zeDeviceGetCommandQueueGroupProperties(device, &numQueueGroups, nullptr));
if (numQueueGroups == 0) {
std::cout << "No queue groups found!\n";
std::terminate();
}
std::vector<ze_command_queue_group_properties_t> queueProperties(numQueueGroups);
VALIDATECALL(zeDeviceGetCommandQueueGroupProperties(device, &numQueueGroups,
queueProperties.data()));
ze_command_queue_handle_t cmdQueue;
ze_command_queue_desc_t cmdQueueDesc = {};
for (uint32_t i = 0; i < numQueueGroups; i++) {
if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
cmdQueueDesc.ordinal = i;
}
}
cmdQueueDesc.index = 0;
cmdQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
VALIDATECALL(zeCommandQueueCreate(context, device, &cmdQueueDesc, &cmdQueue));
// Create command list
ze_command_list_handle_t cmdList;
ze_command_list_desc_t cmdListDesc = {};
cmdListDesc.commandQueueGroupOrdinal = cmdQueueDesc.ordinal;
VALIDATECALL(zeCommandListCreate(context, device, &cmdListDesc, &cmdList));
// Create two shared buffers
constexpr size_t allocSize = 4096;
ze_device_mem_alloc_desc_t deviceDesc;
deviceDesc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED;
deviceDesc.ordinal = 0;
ze_host_mem_alloc_desc_t hostDesc;
hostDesc.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED;
void *srcBuffer = nullptr;
VALIDATECALL(zeMemAllocShared(context, &deviceDesc, &hostDesc, allocSize, 1, device, &srcBuffer));
void *dstBuffer = nullptr;
VALIDATECALL(zeMemAllocShared(context, &deviceDesc, &hostDesc, allocSize, 1, device, &dstBuffer));
// Initialize memory
constexpr uint8_t val = 55;
memset(srcBuffer, val, allocSize);
memset(dstBuffer, 0, allocSize);
std::string buildLog;
auto spirV = compileToSpirV(module, "", buildLog);
if (buildLog.size() > 0) {
std::cout << "Build log " << buildLog;
}
VALIDATECALL((0 == spirV.size()));
ze_module_handle_t module = nullptr;
ze_kernel_handle_t kernel = nullptr;
ze_module_desc_t moduleDesc = {};
ze_module_build_log_handle_t buildlog;
moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
moduleDesc.pInputModule = spirV.data();
moduleDesc.inputSize = spirV.size();
moduleDesc.pBuildFlags = "";
if (zeModuleCreate(context, device, &moduleDesc, &module, &buildlog) != ZE_RESULT_SUCCESS) {
size_t szLog = 0;
zeModuleBuildLogGetString(buildlog, &szLog, nullptr);
char *strLog = (char *)malloc(szLog);
zeModuleBuildLogGetString(buildlog, &szLog, strLog);
std::cout << "Build log:" << strLog << std::endl;
free(strLog);
}
VALIDATECALL(zeModuleBuildLogDestroy(buildlog));
ze_kernel_desc_t kernelDesc = {};
kernelDesc.pKernelName = "kernel_copy";
VALIDATECALL(zeKernelCreate(module, &kernelDesc, &kernel));
ze_kernel_properties_t kernProps;
VALIDATECALL(zeKernelGetProperties(kernel, &kernProps));
std::cout << "Kernel : \n"
<< " * name : " << kernelDesc.pKernelName << "\n"
<< " * uuid.mid : " << kernProps.uuid.mid << "\n"
<< " * uuid.kid : " << kernProps.uuid.kid << "\n"
<< " * maxSubgroupSize : " << kernProps.maxSubgroupSize << "\n"
<< " * localMemSize : " << kernProps.localMemSize << "\n"
<< " * spillMemSize : " << kernProps.spillMemSize << "\n"
<< " * privateMemSize : " << kernProps.privateMemSize << "\n"
<< " * maxNumSubgroups : " << kernProps.maxNumSubgroups << "\n"
<< " * numKernelArgs : " << kernProps.numKernelArgs << "\n"
<< " * requiredSubgroupSize : " << kernProps.requiredSubgroupSize << "\n"
<< " * requiredNumSubGroups : " << kernProps.requiredNumSubGroups << "\n"
<< " * requiredGroupSizeX : " << kernProps.requiredGroupSizeX << "\n"
<< " * requiredGroupSizeY : " << kernProps.requiredGroupSizeY << "\n"
<< " * requiredGroupSizeZ : " << kernProps.requiredGroupSizeZ << "\n";
uint32_t groupSizeX = 32u;
uint32_t groupSizeY = 1u;
uint32_t groupSizeZ = 1u;
VALIDATECALL(zeKernelSuggestGroupSize(kernel, allocSize, 1U, 1U, &groupSizeX, &groupSizeY, &groupSizeZ));
VALIDATECALL(zeKernelSetGroupSize(kernel, groupSizeX, groupSizeY, groupSizeZ));
VALIDATECALL(zeKernelSetArgumentValue(kernel, 1, sizeof(dstBuffer), &dstBuffer));
VALIDATECALL(zeKernelSetArgumentValue(kernel, 0, sizeof(srcBuffer), &srcBuffer));
ze_group_count_t dispatchTraits;
dispatchTraits.groupCountX = allocSize / groupSizeX;
dispatchTraits.groupCountY = 1u;
dispatchTraits.groupCountZ = 1u;
VALIDATECALL(zeCommandListAppendLaunchKernel(cmdList, kernel, &dispatchTraits,
nullptr, 0, nullptr));
// Close list and submit for execution
VALIDATECALL(zeCommandListClose(cmdList));
VALIDATECALL(zeCommandQueueExecuteCommandLists(cmdQueue, 1, &cmdList, nullptr));
VALIDATECALL(zeCommandQueueSynchronize(cmdQueue, std::numeric_limits<uint64_t>::max()));
// Validate
bool outputValidationSuccessful = true;
if (memcmp(dstBuffer, srcBuffer, allocSize)) {
outputValidationSuccessful = false;
uint8_t *srcCharBuffer = static_cast<uint8_t *>(srcBuffer);
uint8_t *dstCharBuffer = static_cast<uint8_t *>(dstBuffer);
for (size_t i = 0; i < allocSize; i++) {
if (srcCharBuffer[i] != dstCharBuffer[i]) {
std::cout << "srcBuffer[" << i << "] = " << static_cast<unsigned int>(srcCharBuffer[i]) << " not equal to "
<< "dstBuffer[" << i << "] = " << static_cast<unsigned int>(dstCharBuffer[i]) << "\n";
break;
}
}
}
// Cleanup
VALIDATECALL(zeMemFree(context, dstBuffer));
VALIDATECALL(zeMemFree(context, srcBuffer));
VALIDATECALL(zeCommandListDestroy(cmdList));
VALIDATECALL(zeCommandQueueDestroy(cmdQueue));
VALIDATECALL(zeContextDestroy(context));
std::cout << "\nZello World Results validation " << (outputValidationSuccessful ? "PASSED" : "FAILED") << "\n";
return 0;
}

View File

@ -245,21 +245,6 @@ HWTEST_F(KernelPropertiesTests, givenKernelThenCorrectNameIsRetrieved) {
delete[] kernelNameRetrieved;
}
HWTEST_F(KernelPropertiesTests, givenInvalidKernelThenUnitializedIsReturned) {
ze_kernel_properties_t kernelProperties = {};
std::vector<KernelInfo *> prevKernelInfos;
L0::ModuleImp *moduleImp = reinterpret_cast<L0::ModuleImp *>(module.get());
moduleImp->getTranslationUnit()->programInfo.kernelInfos.swap(prevKernelInfos);
EXPECT_EQ(0u, moduleImp->getTranslationUnit()->programInfo.kernelInfos.size());
ze_result_t res = kernel->getProperties(&kernelProperties);
EXPECT_EQ(ZE_RESULT_ERROR_UNINITIALIZED, res);
prevKernelInfos.swap(moduleImp->getTranslationUnit()->programInfo.kernelInfos);
EXPECT_NE(0u, moduleImp->getTranslationUnit()->programInfo.kernelInfos.size());
}
HWTEST_F(KernelPropertiesTests, whenInitializingThenCalculatesProperPrivateSurfaceSize) {
uint32_t computeUnitsUsedForSratch = 0x300;
@ -297,42 +282,22 @@ HWTEST_F(KernelPropertiesTests, givenValidKernelThenPropertiesAreRetrieved) {
ze_result_t res = kernel->getProperties(&kernelProperties);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
EXPECT_EQ(numKernelArguments, kernelProperties.numKernelArgs);
EXPECT_EQ(6U, kernelProperties.numKernelArgs);
L0::ModuleImp *moduleImp = reinterpret_cast<L0::ModuleImp *>(module.get());
NEO::KernelInfo *ki = nullptr;
for (uint32_t i = 0; i < moduleImp->getTranslationUnit()->programInfo.kernelInfos.size(); i++) {
ki = moduleImp->getTranslationUnit()->programInfo.kernelInfos[i];
if (ki->kernelDescriptor.kernelMetadata.kernelName.compare(0, ki->kernelDescriptor.kernelMetadata.kernelName.size(), kernel->getImmutableData()->getDescriptor().kernelMetadata.kernelName) == 0) {
break;
}
}
EXPECT_EQ(0U, kernelProperties.requiredNumSubGroups);
EXPECT_EQ(0U, kernelProperties.requiredSubgroupSize);
uint32_t requiredNumSubGroups = static_cast<uint32_t>(ki->patchInfo.executionEnvironment->CompiledSubGroupsNumber);
EXPECT_EQ(requiredNumSubGroups, kernelProperties.requiredNumSubGroups);
uint32_t requiredSubgroupSize = static_cast<uint32_t>(ki->requiredSubGroupSize);
EXPECT_EQ(requiredSubgroupSize, kernelProperties.requiredSubgroupSize);
uint32_t maxSubgroupSize = ki->getMaxSimdSize();
uint32_t maxSubgroupSize = this->kernel->getKernelDescriptor().kernelAttributes.simdSize;
ASSERT_NE(0U, maxSubgroupSize);
EXPECT_EQ(maxSubgroupSize, kernelProperties.maxSubgroupSize);
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getDevice()->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
uint32_t maxRequiredWorkGroupSize = static_cast<uint32_t>(ki->getMaxRequiredWorkGroupSize(maxKernelWorkGroupSize));
uint32_t largestCompiledSIMDSize = static_cast<uint32_t>(ki->patchInfo.executionEnvironment->LargestCompiledSIMDSize);
uint32_t maxNumSubgroups = static_cast<uint32_t>(Math::divideAndRoundUp(maxRequiredWorkGroupSize, largestCompiledSIMDSize));
uint32_t maxNumSubgroups = maxKernelWorkGroupSize / maxSubgroupSize;
EXPECT_EQ(maxNumSubgroups, kernelProperties.maxNumSubgroups);
uint32_t localMemSize = static_cast<uint32_t>(moduleImp->getDevice()->getNEODevice()->getDeviceInfo().localMemSize);
EXPECT_EQ(localMemSize, kernelProperties.localMemSize);
uint32_t privateMemSize = ki->patchInfo.pAllocateStatelessPrivateSurface ? ki->patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize
: 0;
EXPECT_EQ(privateMemSize, kernelProperties.privateMemSize);
uint32_t spillMemSize = ki->patchInfo.mediavfestate ? ki->patchInfo.mediavfestate->PerThreadScratchSpace
: 0;
EXPECT_EQ(spillMemSize, kernelProperties.spillMemSize);
EXPECT_EQ(sizeof(float) * 16U, kernelProperties.localMemSize);
EXPECT_EQ(0U, kernelProperties.privateMemSize);
EXPECT_EQ(0U, kernelProperties.spillMemSize);
uint8_t zeroKid[ZE_MAX_KERNEL_UUID_SIZE];
uint8_t zeroMid[ZE_MAX_MODULE_UUID_SIZE];