/*
 * Copyright (c) 2017 - 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "test.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_stream/linear_stream.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/per_thread_data.h"
#include "runtime/program/kernel_info.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/mocks/mock_graphics_allocation.h"
#include "patch_shared.h"

using namespace OCLRT;

// Fixture shared by all PerThreadDataHelper tests in this file.
//
// The four compile-time flags select which fields of the thread payload are
// marked present in SetUp():
//   localIdX / localIdY / localIdZ - per-dimension local id channels
//   flattenedId                    - flattened local id
// When none of them is set, UnusedPerThreadConstantPresent is set instead.
// The defaults (X, Y and Z present, no flattened id) are what the
// PerThreadDataTests<> instantiation below relies on.
template <bool localIdX = true, bool localIdY = true, bool localIdZ = true, bool flattenedId = false>
struct PerThreadDataTests : public DeviceFixture,
                            ::testing::Test {

    // Builds a minimal KernelInfo (header, thread payload, execution
    // environment) and allocates a 4 KiB, 32-byte aligned scratch buffer that
    // the tests wrap in a LinearStream.
    void SetUp() override {
        DeviceFixture::SetUp();

        kernelHeader = {};
        kernelHeader.KernelHeapSize = sizeof(kernelIsa);

        threadPayload = {};
        threadPayload.LocalIDXPresent = localIdX ? 1 : 0;
        threadPayload.LocalIDYPresent = localIdY ? 1 : 0;
        threadPayload.LocalIDZPresent = localIdZ ? 1 : 0;
        threadPayload.LocalIDFlattenedPresent = flattenedId;
        threadPayload.UnusedPerThreadConstantPresent =
            !(localIdX || localIdY || localIdZ || flattenedId);

        executionEnvironment = {};
        executionEnvironment.CompiledSIMD32 = 1;
        executionEnvironment.LargestCompiledSIMDSize = 32;

        kernelInfo.heapInfo.pKernelHeap = kernelIsa;
        kernelInfo.heapInfo.pKernelHeader = &kernelHeader;
        kernelInfo.patchInfo.executionEnvironment = &executionEnvironment;
        kernelInfo.patchInfo.threadPayload = &threadPayload;

        simd = executionEnvironment.LargestCompiledSIMDSize;
        // Only the X/Y/Z channels are counted; the flattened id is not.
        numChannels = threadPayload.LocalIDXPresent +
                      threadPayload.LocalIDYPresent +
                      threadPayload.LocalIDZPresent;

        indirectHeapMemorySize = 4096;
        indirectHeapMemory = reinterpret_cast<uint8_t *>(alignedMalloc(indirectHeapMemorySize, 32));
        // A null pointer would pass the alignment check below, so reject it explicitly.
        ASSERT_NE(nullptr, indirectHeapMemory);
        ASSERT_TRUE(isAligned<32>(indirectHeapMemory));
    }

    // Releases the scratch buffer before tearing down the device fixture.
    void TearDown() override {
        alignedFree(indirectHeapMemory);
        DeviceFixture::TearDown();
    }

    uint32_t simd = 0;
    uint32_t numChannels = 0;
    uint32_t kernelIsa[32] = {};
    uint8_t *indirectHeapMemory = nullptr;
    size_t indirectHeapMemorySize = 0;

    SKernelBinaryHeaderCommon kernelHeader;
    SPatchThreadPayload threadPayload;
    SPatchExecutionEnvironment executionEnvironment;
    KernelInfo kernelInfo;
};

// Default flags: X, Y and Z local ids present (three channels), SIMD32.
using PerThreadDataXYZTests = PerThreadDataTests<>;

// Expects three channels at SIMD32 to take two GRFs per channel.
HWTEST_F(PerThreadDataXYZTests, getLocalIdSizePerThread) {
    EXPECT_EQ(3 * 2 * sizeof(GRF), PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels));
}

// Expects the total size for 256 work items to be the per-thread size times 256 / 32.
HWTEST_F(PerThreadDataXYZTests, getPerThreadDataSizeTotal) {
    size_t localWorkSize = 256;
    EXPECT_EQ(256 * 3 * 2 * sizeof(GRF) / 32,
              PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize));
}

// The number of bytes sendPerThreadData consumes from the stream must match
// what getPerThreadDataSizeTotal reports for the same work-group size.
HWTEST_F(PerThreadDataXYZTests, sendPerThreadData_256x1x1) {
    MockGraphicsAllocation gfxAllocation(indirectHeapMemory, indirectHeapMemorySize);
    LinearStream indirectHeap(&gfxAllocation);

    size_t localWorkSizes[3] = {256, 1, 1};
    auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];

    auto offsetPerThreadData = PerThreadDataHelper::sendPerThreadData(
        indirectHeap,
        simd,
        numChannels,
        localWorkSizes);

    auto expectedPerThreadDataSizeTotal =
        PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize);

    size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
    EXPECT_EQ(expectedPerThreadDataSizeTotal, sizeConsumed);
}

// Same consumption check for a three-dimensional 2x4x8 work group,
// compared against a hard-coded byte count.
HWTEST_F(PerThreadDataXYZTests, sendPerThreadData_2x4x8) {
    MockGraphicsAllocation gfxAllocation(indirectHeapMemory, indirectHeapMemorySize);
    LinearStream indirectHeap(&gfxAllocation);

    const size_t localWorkSizes[3]{2, 4, 8};

    auto offsetPerThreadData = PerThreadDataHelper::sendPerThreadData(
        indirectHeap,
        simd,
        numChannels,
        localWorkSizes);

    size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
    EXPECT_EQ(64u * (3u * 2u * 4u * 8u) / 32u, sizeConsumed);
}

// Walks through the expected payload size as SIMD width and optional payload
// fields change. The steps are cumulative: each one mutates the fixture state
// left by the previous one.
HWTEST_F(PerThreadDataXYZTests, getThreadPayloadSize) {
    simd = 32;
    uint32_t size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 2u * 3u, size);

    simd = 16;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 3u, size);

    // Header adds one GRF to the expected size.
    simd = 16;
    threadPayload.HeaderPresent = 1;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 4u, size);

    // Unused per-thread constant adds one more GRF.
    simd = 16;
    threadPayload.UnusedPerThreadConstantPresent = 1;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 5u, size);
}

// No local ids at all: numChannels is zero and the fixture marks
// UnusedPerThreadConstantPresent.
using PerThreadDataNoIdsTests = PerThreadDataTests<false, false, false, false>;

HWTEST_F(PerThreadDataNoIdsTests, givenZeroChannelsWhenPassedTogetLocalIdSizePerThreadThenSizeOfOneGrfIsReturned) {
    EXPECT_EQ(32u, PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels));
}

HWTEST_F(PerThreadDataNoIdsTests, givenZeroChannelsAndHighWkgSizeWhengetPerThreadDataSizeTotalIsCalledThenReturnedSizeContainsUnusedGrfPerEachThread) {
    size_t localWorkSize = 256u;
    auto threadCount = localWorkSize / simd;
    auto expectedSize = threadCount * sizeof(GRF);
    EXPECT_EQ(expectedSize,
              PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize));
}
// With zero channels nothing may be written: the stream must not advance and
// every byte of the pre-filled heap must still hold the fill pattern.
HWTEST_F(PerThreadDataNoIdsTests, sendPerThreadDataDoesntSendAnyData) {
    uint8_t fillValue = 0xcc;
    memset(indirectHeapMemory, fillValue, indirectHeapMemorySize);

    MockGraphicsAllocation gfxAllocation(indirectHeapMemory, indirectHeapMemorySize);
    LinearStream indirectHeap(&gfxAllocation);

    size_t localWorkSizes[3] = {256, 1, 1};

    auto offsetPerThreadData = PerThreadDataHelper::sendPerThreadData(
        indirectHeap,
        simd,
        numChannels,
        localWorkSizes);

    size_t sizeConsumed = indirectHeap.getUsed() - offsetPerThreadData;
    EXPECT_EQ(0u, sizeConsumed);

    // ASSERT (not EXPECT) so the scan stops at the first corrupted byte.
    for (size_t i = 0; i < indirectHeapMemorySize; ++i) {
        ASSERT_EQ(fillValue, indirectHeapMemory[i]) << "for index " << i;
    }
}

// Expected payload sizes when no local ids are present. The steps are
// cumulative: HeaderPresent stays set once enabled.
HWTEST_F(PerThreadDataNoIdsTests, getThreadPayloadSize) {
    simd = 32;
    uint32_t size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF), size);

    simd = 16;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF), size);

    simd = 16;
    threadPayload.HeaderPresent = 1;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 2u, size);
}

// Only the flattened local id is present; X/Y/Z channels are disabled.
typedef PerThreadDataTests<false, false, false, true> PerThreadDataFlattenedIdsTests;

// Expected payload sizes with a flattened id only. The steps are cumulative:
// HeaderPresent stays set once enabled.
HWTEST_F(PerThreadDataFlattenedIdsTests, getThreadPayloadSize) {
    simd = 32;
    uint32_t size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 2u, size);

    simd = 16;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF), size);

    simd = 16;
    threadPayload.HeaderPresent = 1;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 2u, size);

    simd = 32;
    threadPayload.HeaderPresent = 1;
    size = PerThreadDataHelper::getThreadPayloadSize(threadPayload, simd);
    EXPECT_EQ(sizeof(GRF) * 3u, size);
}

// Overrun check for sendPerThreadData: the destination buffer is allocated
// four times larger than the reported total size and zero-filled, then every
// chunk after the first is compared against an all-zero reference.
TEST(PerThreadDataTest, generateLocalIDs) {
    uint32_t simd = 8;
    uint32_t numChannels = 3;
    uint32_t localWorkSize = 24;

    size_t localWorkSizes[3] = {24, 1, 1};

    auto sizePerThreadDataTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, numChannels, localWorkSize);

    auto sizeOverSizedBuffer = sizePerThreadDataTotal * 4;
    auto buffer = reinterpret_cast<uint8_t *>(alignedMalloc(sizeOverSizedBuffer, 16));
    memset(buffer, 0, sizeOverSizedBuffer);

    // Set up a reference chunk filled with zeros.
    auto reference = reinterpret_cast<uint8_t *>(alignedMalloc(sizePerThreadDataTotal, 16));
    memset(reference, 0, sizePerThreadDataTotal);

    // The stream is given only half of the oversized buffer.
    LinearStream stream(buffer, sizeOverSizedBuffer / 2);
    PerThreadDataHelper::sendPerThreadData(
        stream,
        simd,
        numChannels,
        localWorkSizes);

    // Check whether a buffer overrun happened: only the first
    // sizePerThreadDataTotal bytes may be overwritten, everything after them
    // must still match the reference.
    for (auto i = sizePerThreadDataTotal; i < sizeOverSizedBuffer; i += sizePerThreadDataTotal) {
        int result = memcmp(buffer + i, reference, sizePerThreadDataTotal);
        EXPECT_EQ(0, result);
    }

    alignedFree(buffer);
    alignedFree(reference);
}