mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 09:58:55 +08:00
add support for packed simd1 dispatch
Change-Id: I3f2bf8e62e0a38d358fb87f02c88c387c874f6b3
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
2eafa99342
commit
51dcf2b6d2
14
core/helpers/simd_helper.h
Normal file
14
core/helpers/simd_helper.h
Normal file
@@ -0,0 +1,14 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
|
||||
/**
 * Translates a kernel SIMD width into the SIMD_SIZE enumerator of a walker command.
 *
 * The enumerator value is the width shifted right by four bits
 * (8 -> 0, 16 -> 1, 32 -> 2). A width of 1 (packed SIMD1 dispatch) is
 * mapped to the same value as a width of 32.
 *
 * Kept as a single return statement so the function stays a valid
 * constexpr function under C++11 rules.
 *
 * @tparam WALKER_TYPE command type that exposes a nested SIMD_SIZE enum.
 * @param simdSize SIMD width of the kernel.
 * @return (simdSize >> 4) converted to WALKER_TYPE::SIMD_SIZE, with 1 treated as 32.
 */
template <typename WALKER_TYPE>
constexpr typename WALKER_TYPE::SIMD_SIZE getSimdConfig(uint32_t simdSize) {
    return static_cast<typename WALKER_TYPE::SIMD_SIZE>(((simdSize == 1u) ? 32u : simdSize) >> 4);
}
|
||||
@@ -7,6 +7,7 @@
|
||||
set(NEO_CORE_TESTS_GEN11
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/preamble_tests_gen11.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests_gen11.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_preamble_gen11.cpp
|
||||
)
|
||||
set_property(GLOBAL PROPERTY NEO_CORE_TESTS_GEN11 ${NEO_CORE_TESTS_GEN11})
|
||||
|
||||
16
core/unit_tests/gen11/simd_helper_tests_gen11.cpp
Normal file
16
core/unit_tests/gen11/simd_helper_tests_gen11.cpp
Normal file
@@ -0,0 +1,16 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "core/unit_tests/helpers/simd_helper_tests.inl"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
using TestSimdConfigSet = ::testing::Test;
|
||||
|
||||
// Runs the shared getSimdConfig test body from simd_helper_tests.inl with the
// GPGPU_WALKER type of the FamilyType available inside this test.
GEN11TEST_F(TestSimdConfigSet, GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturnedGen11) {
    using WalkerType = typename FamilyType::GPGPU_WALKER;
    GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturned<WalkerType>::TestBodyImpl();
}
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
set(NEO_CORE_TESTS_GEN12LP
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests_gen12lp.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_preamble_gen12lp.cpp
|
||||
)
|
||||
set_property(GLOBAL PROPERTY NEO_CORE_TESTS_GEN12LP ${NEO_CORE_TESTS_GEN12LP})
|
||||
|
||||
16
core/unit_tests/gen12lp/simd_helper_tests_gen12lp.cpp
Normal file
16
core/unit_tests/gen12lp/simd_helper_tests_gen12lp.cpp
Normal file
@@ -0,0 +1,16 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "core/unit_tests/helpers/simd_helper_tests.inl"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
using TestSimdConfigSet = ::testing::Test;
|
||||
|
||||
// Runs the shared getSimdConfig test body from simd_helper_tests.inl with the
// GPGPU_WALKER type of the FamilyType available inside this test.
GEN12LPTEST_F(TestSimdConfigSet, GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturnedGen12LP) {
    using WalkerType = typename FamilyType::GPGPU_WALKER;
    GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturned<WalkerType>::TestBodyImpl();
}
|
||||
@@ -7,6 +7,7 @@
|
||||
set(NEO_CORE_TESTS_GEN8
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_preamble_gen8.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests_gen8.cpp
|
||||
)
|
||||
set_property(GLOBAL PROPERTY NEO_CORE_TESTS_GEN8 ${NEO_CORE_TESTS_GEN8})
|
||||
add_subdirectories()
|
||||
16
core/unit_tests/gen8/simd_helper_tests_gen8.cpp
Normal file
16
core/unit_tests/gen8/simd_helper_tests_gen8.cpp
Normal file
@@ -0,0 +1,16 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "core/unit_tests/helpers/simd_helper_tests.inl"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
using TestSimdConfigSet = ::testing::Test;
|
||||
|
||||
// Runs the shared getSimdConfig test body from simd_helper_tests.inl with the
// GPGPU_WALKER type of the FamilyType available inside this test.
GEN8TEST_F(TestSimdConfigSet, GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturnedGen8) {
    using WalkerType = typename FamilyType::GPGPU_WALKER;
    GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturned<WalkerType>::TestBodyImpl();
}
|
||||
@@ -7,6 +7,7 @@
|
||||
set(NEO_CORE_TESTS_GEN9
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/preamble_tests_gen9.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests_gen9.cpp
|
||||
)
|
||||
set_property(GLOBAL PROPERTY NEO_CORE_TESTS_GEN9 ${NEO_CORE_TESTS_GEN9})
|
||||
add_subdirectories()
|
||||
16
core/unit_tests/gen9/simd_helper_tests_gen9.cpp
Normal file
16
core/unit_tests/gen9/simd_helper_tests_gen9.cpp
Normal file
@@ -0,0 +1,16 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "core/unit_tests/helpers/simd_helper_tests.inl"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
using TestSimdConfigSet = ::testing::Test;
|
||||
|
||||
// Runs the shared getSimdConfig test body from simd_helper_tests.inl with the
// GPGPU_WALKER type of the FamilyType available inside this test.
GEN9TEST_F(TestSimdConfigSet, GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturnedGen9) {
    using WalkerType = typename FamilyType::GPGPU_WALKER;
    GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturned<WalkerType>::TestBodyImpl();
}
|
||||
@@ -11,6 +11,7 @@ set(NEO_CORE_HELPERS_TESTS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hash_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/memory_leak_listener.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/memory_management.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/string_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/string_to_hash_tests.cpp
|
||||
)
|
||||
|
||||
34
core/unit_tests/helpers/simd_helper_tests.inl
Normal file
34
core/unit_tests/helpers/simd_helper_tests.inl
Normal file
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "core/helpers/simd_helper.h"
|
||||
#include "test.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename WALKER_TYPE>
|
||||
class GivenSimdSizeWhenGetSimdConfigCalledThenCorrectEnumReturned {
|
||||
public:
|
||||
static void TestBodyImpl() {
|
||||
uint32_t simd = 32;
|
||||
auto result = getSimdConfig<WALKER_TYPE>(simd);
|
||||
EXPECT_EQ(result, WALKER_TYPE::SIMD_SIZE::SIMD_SIZE_SIMD32);
|
||||
|
||||
simd = 16;
|
||||
result = getSimdConfig<WALKER_TYPE>(simd);
|
||||
EXPECT_EQ(result, WALKER_TYPE::SIMD_SIZE::SIMD_SIZE_SIMD16);
|
||||
|
||||
simd = 8;
|
||||
result = getSimdConfig<WALKER_TYPE>(simd);
|
||||
EXPECT_EQ(result, WALKER_TYPE::SIMD_SIZE::SIMD_SIZE_SIMD8);
|
||||
|
||||
simd = 1;
|
||||
result = getSimdConfig<WALKER_TYPE>(simd);
|
||||
EXPECT_EQ(result, WALKER_TYPE::SIMD_SIZE::SIMD_SIZE_SIMD32);
|
||||
}
|
||||
};
|
||||
} // namespace NEO
|
||||
@@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "core/helpers/simd_helper.h"
|
||||
#include "runtime/command_queue/gpgpu_walker_base.inl"
|
||||
#include "runtime/helpers/engine_node_helper.h"
|
||||
|
||||
@@ -43,7 +44,7 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
|
||||
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
|
||||
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
|
||||
walkerCmd->setSimdSize(getSimdConfig<WALKER_TYPE<GfxFamily>>(simd));
|
||||
|
||||
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
|
||||
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
|
||||
|
||||
@@ -52,8 +52,10 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
|
||||
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
} else if (simd == 16) {
|
||||
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
} else {
|
||||
} else if (simd == 8) {
|
||||
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
} else {
|
||||
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,4 +116,23 @@ inline void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16
|
||||
offset += 3 * rowWidth;
|
||||
}
|
||||
}
|
||||
|
||||
void generateLocalIDsForSimdOne(void *b, const std::array<uint16_t, 3> &localWorkgroupSize,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder) {
|
||||
uint32_t xDimNum = dimensionsOrder[0];
|
||||
uint32_t yDimNum = dimensionsOrder[1];
|
||||
uint32_t zDimNum = dimensionsOrder[2];
|
||||
|
||||
for (int i = 0; i < localWorkgroupSize[zDimNum]; i++) {
|
||||
for (int j = 0; j < localWorkgroupSize[yDimNum]; j++) {
|
||||
for (int k = 0; k < localWorkgroupSize[xDimNum]; k++) {
|
||||
static_cast<uint16_t *>(b)[0] = k;
|
||||
static_cast<uint16_t *>(b)[1] = j;
|
||||
static_cast<uint16_t *>(b)[2] = i;
|
||||
b = ptrOffset(b, sizeof(GRF));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -34,14 +34,16 @@ inline size_t getThreadsPerWG(uint32_t simd, size_t lws) {
|
||||
? 5
|
||||
: simd == 16
|
||||
? 4
|
||||
: 3;
|
||||
: simd == 8
|
||||
? 3
|
||||
: 0;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
inline size_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t numChannels = 3) {
|
||||
auto numGRFSPerThread = getGRFsPerThread(simd);
|
||||
auto returnSize = numChannels * numGRFSPerThread * sizeof(GRF);
|
||||
auto returnSize = numGRFSPerThread * sizeof(GRF) * (simd == 1 ? 1 : numChannels);
|
||||
returnSize = std::max(returnSize, sizeof(GRF));
|
||||
return returnSize;
|
||||
}
|
||||
@@ -68,4 +70,7 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
|
||||
void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
|
||||
|
||||
bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
|
||||
|
||||
void generateLocalIDsForSimdOne(void *b, const std::array<uint16_t, 3> &localWorkgroupSize,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
} // namespace NEO
|
||||
@@ -169,7 +169,7 @@ struct KernelInfo {
|
||||
size_t getBorderColorOffset() const;
|
||||
unsigned int getMaxSimdSize() const {
|
||||
const auto executionEnvironment = patchInfo.executionEnvironment;
|
||||
if (executionEnvironment == nullptr) {
|
||||
if (executionEnvironment == nullptr || executionEnvironment->LargestCompiledSIMDSize == 1) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -124,6 +124,27 @@ HWTEST_F(DispatchWalkerTest, WhenGettingComputeDimensionsThenCorrectNumberOfDime
|
||||
EXPECT_EQ(3u, computeDimensions(workItems3D));
|
||||
}
|
||||
|
||||
// Verifies that calling setGpgpuWalkerThreadData with a SIMD width of 1
// leaves the walker's SIMD size set to the SIMD32 encoding (32 >> 4).
HWTEST_F(DispatchWalkerTest, givenSimd1WhenSetGpgpuWalkerThreadDataThenSimdInWalkerIsSetTo32Value) {
    uint32_t pCmdBuffer[1024];
    MockGraphicsAllocation gfxAllocation(static_cast<void *>(pCmdBuffer), sizeof(pCmdBuffer));
    LinearStream linearStream(&gfxAllocation);

    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    WALKER_TYPE *computeWalker = static_cast<WALKER_TYPE *>(linearStream.getSpace(sizeof(WALKER_TYPE)));
    *computeWalker = FamilyType::cmdInitGpgpuWalker;

    size_t globalOffsets[] = {0, 0, 0};
    size_t startWorkGroups[] = {0, 0, 0};
    size_t numWorkGroups[] = {1, 1, 1};
    size_t localWorkSizesIn[] = {32, 1, 1};
    uint32_t simd = 1;
    // Value-initialized so the callee never observes indeterminate payload fields.
    iOpenCL::SPatchThreadPayload threadPayload = {};

    GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(
        computeWalker, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizesIn, simd, 3, true, false, threadPayload, 5u);
    EXPECT_EQ(computeWalker->getSimdSize(), 32 >> 4);
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, WhenDispatchingWalkerThenCommandStreamMemoryIsntChanged) {
|
||||
MockKernel kernel(program.get(), kernelInfo, *pDevice);
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
@@ -59,6 +59,12 @@ TEST(LocalID, PerThreadSizeLocalIDs_SIMD32) {
|
||||
EXPECT_EQ(6 * sizeof(GRF), getPerThreadSizeLocalIDs(simd));
|
||||
}
|
||||
|
||||
// With a SIMD width of 1 and the default channel count, the local-ID size
// is expected to be exactly one GRF.
TEST(LocalID, PerThreadSizeLocalIDs_SIMD1) {
    const uint32_t simdWidth = 1;
    const auto perThreadSize = getPerThreadSizeLocalIDs(simdWidth);

    EXPECT_EQ(sizeof(GRF), perThreadSize);
}
|
||||
|
||||
struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int, int>> {
|
||||
void SetUp() override {
|
||||
simd = std::get<0>(GetParam());
|
||||
|
||||
@@ -252,3 +252,50 @@ TEST(PerThreadDataTest, generateLocalIDs) {
|
||||
alignedFree(buffer);
|
||||
alignedFree(reference);
|
||||
}
|
||||
|
||||
// Sends per-thread data for a 3x4x2 work group with a SIMD width of 1 and
// checks that consecutive GRF-sized entries start with the (x, y, z) local
// IDs, x varying fastest, and that nothing past the expected size is written.
TEST(PerThreadDataTest, givenSimdEqualOneWhenSetingLocalIdsInPerThreadDataThenIdsAreSetInCorrectOrder) {
    uint32_t simd = 1;
    uint32_t numChannels = 3;
    uint32_t localWorkSize = 24;
    size_t localWorkSizes[3] = {3, 4, 2};

    const auto perThreadDataSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize);
    const auto oversizedBufferSize = perThreadDataSize * 4;

    // Zero-filled buffer, four times larger than needed, so overruns are detectable.
    auto buffer = static_cast<char *>(alignedMalloc(oversizedBufferSize, 16));
    memset(buffer, 0, oversizedBufferSize);

    // Zero-filled reference used to confirm the tail of the buffer stays untouched.
    auto reference = static_cast<char *>(alignedMalloc(perThreadDataSize, 16));
    memset(reference, 0, perThreadDataSize);

    LinearStream stream(buffer, oversizedBufferSize / 2);
    PerThreadDataHelper::sendPerThreadData(stream, simd, numChannels, localWorkSizes, {{0, 1, 2}}, false);

    const char *entry = buffer;
    for (uint16_t z = 0; z < localWorkSizes[2]; z++) {
        for (uint16_t y = 0; y < localWorkSizes[1]; y++) {
            for (uint16_t x = 0; x < localWorkSizes[0]; x++) {
                const uint16_t expectedIds[] = {x, y, z};
                EXPECT_EQ(0, memcmp(entry, expectedIds, sizeof(uint16_t) * 3));
                entry += sizeof(GRF);
            }
        }
    }

    // Check whether a buffer overrun happened: only the first perThreadDataSize bytes
    // may be overwritten, every following chunk must still match the reference.
    for (auto offset = perThreadDataSize; offset < oversizedBufferSize; offset += perThreadDataSize) {
        EXPECT_EQ(0, memcmp(buffer + offset, reference, perThreadDataSize));
    }

    alignedFree(buffer);
    alignedFree(reference);
}
|
||||
|
||||
@@ -2163,6 +2163,16 @@ TEST_F(KernelExecutionEnvironmentTest, getMaxSimdReturns1WhenExecutionEnvironmen
|
||||
this->pKernelInfo->patchInfo.executionEnvironment = oldExcEnv;
|
||||
}
|
||||
|
||||
// getMaxSimdSize() is expected to report 1 when the fixture's execution
// environment has LargestCompiledSIMDSize set to 1.
TEST_F(KernelExecutionEnvironmentTest, getMaxSimdReturns1WhenLargestCompilledSimdSizeEqualOne) {
    executionEnvironment.LargestCompiledSIMDSize = 1;

    // patchInfo.executionEnvironment is not replaced by this test, so there is
    // nothing to save and restore around the call.
    EXPECT_EQ(1U, this->pKernelInfo->getMaxSimdSize());
}
|
||||
|
||||
TEST_F(KernelExecutionEnvironmentTest, getMaxRequiredWorkGroupSizeWhenCompiledWorkGroupSizeIsZero) {
|
||||
auto maxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize;
|
||||
auto oldRequiredWorkGroupSizeX = this->pKernelInfo->patchInfo.executionEnvironment->RequiredWorkGroupSizeX;
|
||||
|
||||
Reference in New Issue
Block a user