Align per-thread data size to GRF size

Signed-off-by: Krystian Chmielewski <krystian.chmielewski@intel.com>
2021-07-13 15:29:58 +00:00 · 2021-07-13 15:29:58 +00:00 · 1b2cfbbb1f
parent d18172c00e
commit 1b2cfbbb1f
7 changed files with 52 additions and 7 deletions
--- a/level_zero/core/source/module/module_imp.cpp
+++ b/level_zero/core/source/module/module_imp.cpp
@ -200,6 +200,7 @@ bool ModuleTranslationUnit::processUnpackedBinary() {
    auto blob = ArrayRef<const uint8_t>(reinterpret_cast<const uint8_t *>(this->unpackedDeviceBinary.get()), this->unpackedDeviceBinarySize);
    NEO::SingleDeviceBinary binary = {};
    binary.deviceBinary = blob;
+    binary.targetDevice.grfSize = device->getHwInfo().capabilityTable.grfSize;
    std::string decodeErrors;
    std::string decodeWarnings;

--- a/opencl/source/program/process_device_binary.cpp
+++ b/opencl/source/program/process_device_binary.cpp
@ -146,6 +146,7 @@ cl_int Program::processGenBinary(const ClDevice &clDevice) {
    auto blob = ArrayRef<const uint8_t>(reinterpret_cast<const uint8_t *>(buildInfo.unpackedDeviceBinary.get()), buildInfo.unpackedDeviceBinarySize);
    SingleDeviceBinary binary = {};
    binary.deviceBinary = blob;
+    binary.targetDevice.grfSize = clDevice.getDevice().getHardwareInfo().capabilityTable.grfSize;
    std::string decodeErrors;
    std::string decodeWarnings;

--- a/shared/source/device_binary_format/device_binary_formats.h
+++ b/shared/source/device_binary_format/device_binary_formats.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Intel Corporation
+ * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -58,6 +58,7 @@ struct TargetDevice {
    PRODUCT_FAMILY productFamily = IGFX_UNKNOWN;
    uint32_t stepping = 0U;
    uint32_t maxPointerSizeInBytes = 4U;
+    uint32_t grfSize = 32U;
 };

 struct SingleDeviceBinary {
--- a/shared/source/device_binary_format/zebin_decoder.cpp
+++ b/shared/source/device_binary_format/zebin_decoder.cpp
@ -605,7 +605,7 @@ bool setVecArgIndicesBasedOnSize(CrossThreadDataOffset (&vec)[Len], size_t vecSi
    return true;
 }

-NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Types::Kernel::PerThreadPayloadArgument::PerThreadPayloadArgumentBaseT &src, NEO::KernelDescriptor &dst,
+NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Types::Kernel::PerThreadPayloadArgument::PerThreadPayloadArgumentBaseT &src, NEO::KernelDescriptor &dst, uint32_t grfSize,
                                       std::string &outErrReason, std::string &outWarning) {
    switch (src.argType) {
    default:
@ -620,6 +620,8 @@ NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Type

        uint32_t singleChannelIndicesCount = (dst.kernelAttributes.simdSize == 32 ? 32 : 16);
        uint32_t singleChannelBytes = singleChannelIndicesCount * sizeof(LocalIdT);
+        UNRECOVERABLE_IF(0 == grfSize);
+        singleChannelBytes = alignUp(singleChannelBytes, grfSize);
        auto tupleSize = (src.size / singleChannelBytes);
        switch (tupleSize) {
        default:
@ -634,8 +636,9 @@ NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Type
            break;
        }
        dst.kernelAttributes.perThreadDataSize = dst.kernelAttributes.simdSize;
-        dst.kernelAttributes.perThreadDataSize *= dst.kernelAttributes.numLocalIdChannels;
        dst.kernelAttributes.perThreadDataSize *= sizeof(LocalIdT);
+        dst.kernelAttributes.perThreadDataSize = alignUp(dst.kernelAttributes.perThreadDataSize, grfSize);
+        dst.kernelAttributes.perThreadDataSize *= dst.kernelAttributes.numLocalIdChannels;
        break;
    }
    case NEO::Elf::ZebinKernelMetadata::Types::Kernel::ArgTypePackedLocalIds: {
@ -956,7 +959,7 @@ NEO::DecodeError populateKernelDescriptor(NEO::ProgramInfo &dst, NEO::Elf::Elf<N
    }

    for (const auto &arg : perThreadPayloadArguments) {
-        auto decodeErr = populateArgDescriptor(arg, kernelDescriptor, outErrReason, outWarning);
+        auto decodeErr = populateArgDescriptor(arg, kernelDescriptor, dst.grfSize, outErrReason, outWarning);
        if (DecodeError::Success != decodeErr) {
            return decodeErr;
        }
@ -1130,6 +1133,7 @@ DecodeError decodeSingleDeviceBinary<NEO::DeviceBinaryFormat::Zebin>(ProgramInfo
    }

    dst.decodedElf = elf;
+    dst.grfSize = src.targetDevice.grfSize;

    if (false == zebinSections.globalDataSections.empty()) {
        dst.globalVariables.initData = zebinSections.globalDataSections[0]->data.begin();
--- a/shared/source/device_binary_format/zebin_decoder.h
+++ b/shared/source/device_binary_format/zebin_decoder.h
@ -95,7 +95,7 @@ DecodeError readZeInfoPerThreadMemoryBuffers(const NEO::Yaml::YamlParser &parser
                                             ConstStringRef context,
                                             std::string &outErrReason, std::string &outWarning);

-NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Types::Kernel::PerThreadPayloadArgument::PerThreadPayloadArgumentBaseT &src, NEO::KernelDescriptor &dst,
+NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Types::Kernel::PerThreadPayloadArgument::PerThreadPayloadArgumentBaseT &src, NEO::KernelDescriptor &dst, const uint32_t grfSize,
                                       std::string &outErrReason, std::string &outWarning);

 NEO::DecodeError populateArgDescriptor(const NEO::Elf::ZebinKernelMetadata::Types::Kernel::PayloadArgument::PayloadArgumentBaseT &src, NEO::KernelDescriptor &dst, uint32_t &crossThreadDataSize,
--- a/shared/source/program/program_info.h
+++ b/shared/source/program/program_info.h
@ -41,6 +41,7 @@ struct ProgramInfo {

    std::vector<KernelInfo *> kernelInfos;
    Elf::Elf<Elf::EI_CLASS_64> decodedElf;
+    uint32_t grfSize = 32U;
 };

 size_t getMaxInlineSlmNeeded(const ProgramInfo &programInfo);
--- a/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp
+++ b/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp
@ -2520,9 +2520,9 @@ TEST(PopulateKernelDescriptor, GivenInvalidPerThreadArgThenFails) {
    NEO::ConstStringRef zeinfo = R"===(
 kernels:
    - name : some_kernel
-      execution_env:   
+      execution_env:
        simd_size: 8
-      per_thread_payload_arguments: 
+      per_thread_payload_arguments:
        - arg_type:        local_size
          offset:          0
          size:            8
@ -2544,6 +2544,43 @@ kernels:
    EXPECT_STREQ("DeviceBinaryFormat::Zebin : Invalid arg type in per-thread data section in context of : some_kernel.\n", decodeErrors.c_str());
 }

+TEST(PopulateKernelDescriptor, GivenValidLocalIdThenAlignUpChannelSizeToGrfSize) { // checks that each local-id channel is padded to a whole GRF
+    NEO::ConstStringRef zeinfo = R"===(
+kernels:
+    - name : some_kernel
+      execution_env:
+        simd_size: 16
+      per_thread_payload_arguments:
+        - arg_type:        local_id
+          offset:          0
+          size:            192
+)===";
+    NEO::ProgramInfo programInfo;
+    programInfo.grfSize = 64; // non-default GRF size (ProgramInfo::grfSize defaults to 32)
+    ZebinTestData::ValidEmptyProgram zebin;
+    zebin.appendSection(NEO::Elf::SHT_PROGBITS, NEO::Elf::SectionsNamesZebin::textPrefix.str() + "some_kernel", {}); // empty text section named after the kernel
+    std::string errors, warnings;
+    auto elf = NEO::Elf::decodeElf(zebin.storage, errors, warnings);
+    ASSERT_NE(nullptr, elf.elfFileHeader) << errors << " " << warnings;
+
+    NEO::Yaml::YamlParser parser;
+    bool parseSuccess = parser.parse(zeinfo, errors, warnings);
+    ASSERT_TRUE(parseSuccess) << errors << " " << warnings;
+
+    NEO::ZebinSections zebinSections;
+    auto extractErr = NEO::extractZebinSections(elf, zebinSections, errors, warnings);
+    ASSERT_EQ(NEO::DecodeError::Success, extractErr) << errors << " " << warnings;
+
+    auto &kernelNode = *parser.createChildrenRange(*parser.findNodeWithKeyDfs("kernels")).begin(); // first (only) entry under "kernels"
+    auto err = NEO::populateKernelDescriptor(programInfo, elf, zebinSections, parser, kernelNode, errors, warnings);
+    EXPECT_EQ(NEO::DecodeError::Success, err);
+    EXPECT_TRUE(errors.empty()) << errors;
+    EXPECT_TRUE(warnings.empty()) << warnings;
+    ASSERT_EQ(1U, programInfo.kernelInfos.size());
+    EXPECT_EQ(3U, programInfo.kernelInfos[0]->kernelDescriptor.kernelAttributes.numLocalIdChannels); // size 192 / GRF-aligned channel size; 3 implies one 64-byte GRF per channel
+    EXPECT_EQ(192U, programInfo.kernelInfos[0]->kernelDescriptor.kernelAttributes.perThreadDataSize); // per-channel size is aligned up to grfSize first, then multiplied by the channel count
+}
+
 TEST(PopulateKernelDescriptor, GivenValidPerThreadArgThenPopulatesKernelDescriptor) {
    NEO::ConstStringRef zeinfo = R"===(
 kernels: