diff --git a/runtime/built_ins/built_ins.inl b/runtime/built_ins/built_ins.inl index 982564d6aa..3a70c3869d 100644 --- a/runtime/built_ins/built_ins.inl +++ b/runtime/built_ins/built_ins.inl @@ -33,7 +33,7 @@ bool BuiltInOp::buildDispatchInfos(MultiD for (auto &memObj : *operationParams.memObjsForAuxTranslation) { DispatchInfoBuilder builder; auto graphicsAllocation = memObj->getGraphicsAllocation(); - size_t allocationSize = alignUp(memObj->getSize(), 4); + size_t allocationSize = alignUp(memObj->getSize(), 512); if (AuxTranslationDirection::AuxToNonAux == operationParams.auxTranslationDirection) { builder.setKernel(convertToNonAuxKernel.at(kernelInstanceNumber++).get()); @@ -46,7 +46,7 @@ bool BuiltInOp::buildDispatchInfos(MultiD builder.setArg(1, memObj); } - size_t xGws = allocationSize / 4; + size_t xGws = allocationSize / 16; builder.setDispatchGeometry(Vec3{xGws, 0, 0}, Vec3{0, 0, 0}, Vec3{0, 0, 0}); builder.bake(multiDispatchInfo); @@ -62,8 +62,9 @@ void BuiltInOp::resizeKernelInstances(siz for (size_t i = convertToNonAuxKernel.size(); i < size; i++) { auto clonedKernel1 = Kernel::create(baseKernel->getProgram(), baseKernel->getKernelInfo(), nullptr); - clonedKernel1->setDisableL3forStatefulBuffers(true); + clonedKernel1->setAuxTranslationFlag(true); auto clonedKernel2 = Kernel::create(baseKernel->getProgram(), baseKernel->getKernelInfo(), nullptr); + clonedKernel2->setAuxTranslationFlag(true); clonedKernel1->cloneKernel(baseKernel); clonedKernel2->cloneKernel(baseKernel); diff --git a/runtime/built_ins/kernels/aux_translation.igdrcl_built_in b/runtime/built_ins/kernels/aux_translation.igdrcl_built_in index 335351f044..78ffa89eea 100644 --- a/runtime/built_ins/kernels/aux_translation.igdrcl_built_in +++ b/runtime/built_ins/kernels/aux_translation.igdrcl_built_in @@ -8,6 +8,7 @@ R"===( __kernel void fullCopy(__global const uint* src, __global uint* dst) { unsigned int gid = get_global_id(0); - dst[gid] = src[gid]; + uint4 loaded = vload4(gid, src); + vstore4(loaded, gid, dst); } )===" diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp index 6c57b0ad72..45e7fe2296 100644 --- a/runtime/kernel/kernel.cpp +++ b/runtime/kernel/kernel.cpp @@ -1143,7 +1143,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex, if (requiresSshForBuffers()) { auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap); - buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3forStatefulBuffers); + buffer->setArgStateful(surfaceState, forceNonAuxMode, auxTranslationKernel); kernelArguments[argIndex].isUncacheable = buffer->isMemObjUncacheable(); } addAllocationToCacheFlushVector(argIndex, buffer->getGraphicsAllocation()); diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index da51f6314a..0b3a8968b3 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -385,8 +385,8 @@ class Kernel : public BaseObject<_cl_kernel> { using CacheFlushAllocationsVec = StackVec; void getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const; - void setDisableL3forStatefulBuffers(bool disableL3forStatefulBuffers) { - this->disableL3forStatefulBuffers = disableL3forStatefulBuffers; + void setAuxTranslationFlag(bool auxTranslationFlag) { + this->auxTranslationKernel = auxTranslationFlag; } protected: @@ -487,7 +487,7 @@ class Kernel : public BaseObject<_cl_kernel> { std::vector kernelArgHandlers; std::vector kernelSvmGfxAllocations; - bool disableL3forStatefulBuffers = false; + bool auxTranslationKernel = false; size_t numberOfBindingTableStates; size_t localBindingTableOffset; diff --git a/runtime/mem_obj/buffer.h b/runtime/mem_obj/buffer.h index 338aa89c9a..33efba4fda 100644 --- a/runtime/mem_obj/buffer.h +++ b/runtime/mem_obj/buffer.h @@ -105,7 +105,7 @@ class Buffer : public MemObj { bool isValidSubBufferOffset(size_t offset); uint64_t setArgStateless(void *memory, uint32_t patchSize) { return setArgStateless(memory, patchSize, false); } uint64_t setArgStateless(void *memory, uint32_t patchSize, bool set32BitAddressing); - virtual void setArgStateful(void *memory, bool forceNonAuxMode, bool disableL3Cache) = 0; + virtual void setArgStateful(void *memory, bool forceNonAuxMode, bool programForAuxTranslation) = 0; bool bufferRectPitchSet(const size_t *bufferOrigin, const size_t *region, size_t &bufferRowPitch, @@ -165,7 +165,7 @@ class BufferHw : public Buffer { : Buffer(context, properties, size, memoryStorage, hostPtr, gfxAllocation, zeroCopy, isHostPtrSVM, isObjectRedescribed) {} - void setArgStateful(void *memory, bool forceNonAuxMode, bool disableL3Cache) override; + void setArgStateful(void *memory, bool forceNonAuxMode, bool programForAuxTranslation) override; void appendBufferState(void *memory, Context *context, GraphicsAllocation *gfxAllocation); static Buffer *create(Context *context, diff --git a/runtime/mem_obj/buffer.inl b/runtime/mem_obj/buffer.inl index c076dab125..503e34e842 100644 --- a/runtime/mem_obj/buffer.inl +++ b/runtime/mem_obj/buffer.inl @@ -28,13 +28,12 @@ union SURFACE_STATE_BUFFER_LENGTH { }; template -void BufferHw::setArgStateful(void *memory, bool forceNonAuxMode, bool disableL3Cache) { +void BufferHw::setArgStateful(void *memory, bool forceNonAuxMode, bool programForAuxTranslation) { using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; using SURFACE_FORMAT = typename RENDER_SURFACE_STATE::SURFACE_FORMAT; using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; auto surfaceState = reinterpret_cast(memory); - // The graphics allocation for Host Ptr surface will be created in makeResident call and GPU address is expected to be the same as CPU address auto bufferAddress = (getGraphicsAllocation() != nullptr) ? getGraphicsAllocation()->getGpuAddress() : reinterpret_cast(getHostPtr()); bufferAddress += this->offset; @@ -42,7 +41,7 @@ void BufferHw::setArgStateful(void *memory, bool forceNonAuxMode, boo auto bufferAddressAligned = alignDown(bufferAddress, 4); auto bufferOffset = ptrDiff(bufferAddress, bufferAddressAligned); - auto surfaceSize = alignUp(getSize() + bufferOffset, 4); + auto surfaceSize = alignUp(getSize() + bufferOffset, programForAuxTranslation ? 512 : 4); SURFACE_STATE_BUFFER_LENGTH Length = {0}; Length.Length = static_cast(surfaceSize - 1); @@ -63,7 +62,7 @@ void BufferHw::setArgStateful(void *memory, bool forceNonAuxMode, boo surfaceState->setTileMode(RENDER_SURFACE_STATE::TILE_MODE_LINEAR); surfaceState->setVerticalLineStride(0); surfaceState->setVerticalLineStrideOffset(0); - surfaceState->setMemoryObjectControlState(getMocsValue(disableL3Cache)); + surfaceState->setMemoryObjectControlState(getMocsValue(programForAuxTranslation)); surfaceState->setSurfaceBaseAddress(bufferAddressAligned); Gmm *gmm = graphicsAllocation ? graphicsAllocation->getDefaultGmm() : nullptr; diff --git a/unit_tests/built_ins/built_in_tests.cpp b/unit_tests/built_ins/built_in_tests.cpp index d233e99996..c51cfb2db6 100644 --- a/unit_tests/built_ins/built_in_tests.cpp +++ b/unit_tests/built_ins/built_in_tests.cpp @@ -272,7 +272,7 @@ TEST_F(BuiltInTests, givenInputBufferWhenBuildingNonAuxDispatchInfoForAuxTransla EXPECT_EQ(nullptr, kernel->getKernelArguments().at(1).object); EXPECT_EQ(1u, dispatchInfo.getDim()); - size_t xGws = alignUp(buffer->getSize(), 4) / 4; + size_t xGws = alignUp(buffer->getSize(), 512) / 16; Vec3 gws = {xGws, 1, 1}; EXPECT_EQ(gws, dispatchInfo.getGWS()); } diff --git a/unit_tests/helpers/kernel_binary_helper.cpp b/unit_tests/helpers/kernel_binary_helper.cpp index 6df475f804..77f1e22f1e 100644 --- a/unit_tests/helpers/kernel_binary_helper.cpp +++ b/unit_tests/helpers/kernel_binary_helper.cpp @@ -14,7 +14,7 @@ extern PRODUCT_FAMILY productFamily; -const std::string KernelBinaryHelper::BUILT_INS("6400005806705094984"); +const std::string KernelBinaryHelper::BUILT_INS("7030307152995455603"); KernelBinaryHelper::KernelBinaryHelper(const std::string &name, bool appendOptionsToFileName) { // set mock compiler to return expected kernel diff --git a/unit_tests/main.cpp b/unit_tests/main.cpp index 52896526a1..d5d5560e6d 100644 --- a/unit_tests/main.cpp +++ b/unit_tests/main.cpp @@ -399,8 +399,8 @@ int main(int argc, char **argv) { MockCompilerDebugVars fclDebugVars; MockCompilerDebugVars igcDebugVars; - retrieveBinaryKernelFilename(fclDebugVars.fileName, "6400005806705094984_", ".bc"); - retrieveBinaryKernelFilename(igcDebugVars.fileName, "6400005806705094984_", ".gen"); + retrieveBinaryKernelFilename(fclDebugVars.fileName, "7030307152995455603_", ".bc"); + retrieveBinaryKernelFilename(igcDebugVars.fileName, "7030307152995455603_", ".gen"); gEnvironment->setMockFileNames(fclDebugVars.fileName, igcDebugVars.fileName); gEnvironment->setDefaultDebugVars(fclDebugVars, igcDebugVars, device); diff --git a/unit_tests/mem_obj/buffer_tests.cpp b/unit_tests/mem_obj/buffer_tests.cpp index e87b2ecf60..c8762c32d9 100644 --- a/unit_tests/mem_obj/buffer_tests.cpp +++ b/unit_tests/mem_obj/buffer_tests.cpp @@ -1363,17 +1363,16 @@ HWTEST_F(BufferSetSurfaceTests, givenBufferWithOffsetWhenSetArgStatefulIsCalledT DebugManager.flags.Force32bitAddressing.set(false); } -HWTEST_F(BufferSetSurfaceTests, givenBufferWhenSetArgStatefulWithL3ChacheDisabledIsCalledThenL3CacheShouldBeOff) { +HWTEST_F(BufferSetSurfaceTests, givenBufferWhenSetArgStatefulWithL3ChacheDisabledIsCalledThenL3CacheShouldBeOffAndSizeIsAlignedTo512) { MockContext context; - auto size = MemoryConstants::pageSize; - auto ptr = (void *)alignedMalloc(size * 2, MemoryConstants::pageSize); + auto size = 128; auto retVal = CL_SUCCESS; auto buffer = std::unique_ptr(Buffer::create( &context, - CL_MEM_USE_HOST_PTR, + CL_MEM_READ_WRITE, size, - ptr, + nullptr, retVal)); EXPECT_EQ(CL_SUCCESS, retVal); @@ -1385,8 +1384,8 @@ HWTEST_F(BufferSetSurfaceTests, givenBufferWhenSetArgStatefulWithL3ChacheDisable auto mocs = surfaceState.getMemoryObjectControlState(); auto gmmHelper = device->getGmmHelper(); EXPECT_EQ(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED), mocs); - - alignedFree(ptr); + EXPECT_EQ(128u, surfaceState.getWidth()); + EXPECT_EQ(4u, surfaceState.getHeight()); } HWTEST_F(BufferSetSurfaceTests, givenAlignedCacheableReadOnlyBufferThenChoseOclBufferPolicy) { diff --git a/unit_tests/test_files/6400005806705094984.cl b/unit_tests/test_files/7030307152995455603.cl similarity index 99% rename from unit_tests/test_files/6400005806705094984.cl rename to unit_tests/test_files/7030307152995455603.cl index 82db4c88a5..0eeca64729 100644 --- a/unit_tests/test_files/6400005806705094984.cl +++ b/unit_tests/test_files/7030307152995455603.cl @@ -7,7 +7,8 @@ __kernel void fullCopy(__global const uint* src, __global uint* dst) { unsigned int gid = get_global_id(0); - dst[gid] = src[gid]; + uint4 loaded = vload4(gid, src); + vstore4(loaded, gid, dst); } __kernel void CopyBufferToBufferBytes( diff --git a/unit_tests/test_files/6400005806705094984_options.txt b/unit_tests/test_files/7030307152995455603_options.txt similarity index 100% rename from unit_tests/test_files/6400005806705094984_options.txt rename to unit_tests/test_files/7030307152995455603_options.txt