From 5739d526c46923344637d9e559be714e50d731b6 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Mon, 6 Jul 2020 13:09:12 +0200 Subject: [PATCH] Broadcast signal to all threads while handling USM pagefault Related-To: NEO-4721 Change-Id: I77185f8db2576f626c1b6b5615ab5d8f9b22076f Signed-off-by: Lukasz Jobczyk --- .../cpu_page_fault_manager.cpp | 6 ++ .../cpu_page_fault_manager.h | 3 + .../linux/cpu_page_fault_manager_linux.cpp | 63 ++++++++++++++++--- .../linux/cpu_page_fault_manager_linux.h | 7 ++- .../cpu_page_fault_manager_windows.cpp | 3 + .../windows/cpu_page_fault_manager_windows.h | 2 + .../cpu_page_fault_manager_linux_tests.cpp | 47 ++++++++++++++ .../mock_cpu_page_fault_manager.h | 1 + 8 files changed, 122 insertions(+), 10 deletions(-) diff --git a/shared/source/page_fault_manager/cpu_page_fault_manager.cpp b/shared/source/page_fault_manager/cpu_page_fault_manager.cpp index 6206569c0e..416acc5abf 100644 --- a/shared/source/page_fault_manager/cpu_page_fault_manager.cpp +++ b/shared/source/page_fault_manager/cpu_page_fault_manager.cpp @@ -66,6 +66,7 @@ bool PageFaultManager::verifyPageFault(void *ptr) { auto allocPtr = alloc.first; auto &pageFaultData = alloc.second; if (ptr >= allocPtr && ptr < ptrOffset(allocPtr, pageFaultData.size)) { + this->broadcastWaitSignal(); this->allowCPUMemoryAccess(allocPtr, pageFaultData.size); this->setAubWritable(true, allocPtr, pageFaultData.unifiedMemoryManager); this->transferToCpu(allocPtr, pageFaultData.size, pageFaultData.cmdQ); @@ -81,4 +82,9 @@ void PageFaultManager::setAubWritable(bool writable, void *ptr, SVMAllocsManager auto gpuAlloc = unifiedMemoryManager->getSVMAlloc(ptr)->gpuAllocations.getDefaultGraphicsAllocation(); gpuAlloc->setAubWritable(writable, GraphicsAllocation::allBanks); } + +void PageFaultManager::waitForCopy() { + std::unique_lock lock{mtx}; +} + } // namespace NEO diff --git a/shared/source/page_fault_manager/cpu_page_fault_manager.h b/shared/source/page_fault_manager/cpu_page_fault_manager.h 
index 4b93503b4f..5ebaa17ad6 100644 --- a/shared/source/page_fault_manager/cpu_page_fault_manager.h +++ b/shared/source/page_fault_manager/cpu_page_fault_manager.h @@ -38,6 +38,9 @@ class PageFaultManager : public NonCopyableOrMovableClass { virtual void allowCPUMemoryAccess(void *ptr, size_t size) = 0; virtual void protectCPUMemoryAccess(void *ptr, size_t size) = 0; + virtual void broadcastWaitSignal() = 0; + MOCKABLE_VIRTUAL void waitForCopy(); + MOCKABLE_VIRTUAL bool verifyPageFault(void *ptr); MOCKABLE_VIRTUAL void transferToCpu(void *ptr, size_t size, void *cmdQ); MOCKABLE_VIRTUAL void transferToGpu(void *ptr, void *cmdQ); diff --git a/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.cpp b/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.cpp index 6d642209b1..8b53063278 100644 --- a/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.cpp +++ b/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.cpp @@ -9,7 +9,11 @@ #include "shared/source/helpers/debug_helpers.h" +#include #include +#include +#include +#include namespace NEO { std::unique_ptr PageFaultManager::create() { @@ -20,7 +24,9 @@ std::function PageFaultManager PageFaultManagerLinux::PageFaultManagerLinux() { pageFaultHandler = [&](int signal, siginfo_t *info, void *context) { - if (!this->verifyPageFault(info->si_addr)) { + if (signal == SIGUSR1) { + this->waitForCopy(); + } else if (!this->verifyPageFault(info->si_addr)) { callPreviousHandler(signal, info, context); } }; @@ -28,13 +34,20 @@ PageFaultManagerLinux::PageFaultManagerLinux() { struct sigaction pageFaultManagerHandler = {}; pageFaultManagerHandler.sa_flags = SA_SIGINFO; pageFaultManagerHandler.sa_sigaction = pageFaultHandlerWrapper; - auto retVal = sigaction(SIGSEGV, &pageFaultManagerHandler, &previousHandler); + + auto retVal = sigaction(SIGSEGV, &pageFaultManagerHandler, &previousPageFaultHandler); + UNRECOVERABLE_IF(retVal != 0); + + retVal = sigaction(SIGUSR1, 
&pageFaultManagerHandler, &previousUserSignalHandler); UNRECOVERABLE_IF(retVal != 0); } PageFaultManagerLinux::~PageFaultManagerLinux() { if (!previousHandlerRestored) { - auto retVal = sigaction(SIGSEGV, &previousHandler, nullptr); + auto retVal = sigaction(SIGSEGV, &previousPageFaultHandler, nullptr); + UNRECOVERABLE_IF(retVal != 0); + + retVal = sigaction(SIGUSR1, &previousUserSignalHandler, nullptr); UNRECOVERABLE_IF(retVal != 0); } } @@ -54,18 +67,50 @@ void PageFaultManagerLinux::protectCPUMemoryAccess(void *ptr, size_t size) { } void PageFaultManagerLinux::callPreviousHandler(int signal, siginfo_t *info, void *context) { - if (previousHandler.sa_flags & SA_SIGINFO) { - previousHandler.sa_sigaction(signal, info, context); + if (previousPageFaultHandler.sa_flags & SA_SIGINFO) { + previousPageFaultHandler.sa_sigaction(signal, info, context); } else { - if (previousHandler.sa_handler == SIG_DFL) { - auto retVal = sigaction(SIGSEGV, &previousHandler, nullptr); + if (previousPageFaultHandler.sa_handler == SIG_DFL) { + auto retVal = sigaction(SIGSEGV, &previousPageFaultHandler, nullptr); UNRECOVERABLE_IF(retVal != 0); previousHandlerRestored = true; - } else if (previousHandler.sa_handler == SIG_IGN) { + } else if (previousPageFaultHandler.sa_handler == SIG_IGN) { return; } else { - previousHandler.sa_handler(signal); + previousPageFaultHandler.sa_handler(signal); } } } + +/* This function is a workaround (WA) for a USM issue in a multithreaded environment. + While handling a page fault, before the copy starts, a user signal (SIGUSR1) + is broadcast to ensure that every other thread has received the signal and is + blocked on PageFaultHandler's mutex before the copy from GPU to CPU proceeds. 
*/ +void PageFaultManagerLinux::broadcastWaitSignal() { + auto selfThreadId = syscall(__NR_gettid); + + auto procDir = opendir("/proc/self/task"); + UNRECOVERABLE_IF(!procDir); + + struct dirent *dirEntry; + while ((dirEntry = readdir(procDir)) != NULL) { + if (dirEntry->d_name[0] == '.') { + continue; + } + + int threadId = atoi(dirEntry->d_name); + if (threadId == selfThreadId) { + continue; + } + + sendSignalToThread(threadId); + } + + closedir(procDir); +} + +void PageFaultManagerLinux::sendSignalToThread(int threadId) { + syscall(SYS_tkill, threadId, SIGUSR1); +} + } // namespace NEO diff --git a/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.h b/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.h index 73d60fbcb9..8d43176cdf 100644 --- a/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.h +++ b/shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.h @@ -24,10 +24,15 @@ class PageFaultManagerLinux : public PageFaultManager { void allowCPUMemoryAccess(void *ptr, size_t size) override; void protectCPUMemoryAccess(void *ptr, size_t size) override; + void broadcastWaitSignal() override; + MOCKABLE_VIRTUAL void sendSignalToThread(int threadId); + void callPreviousHandler(int signal, siginfo_t *info, void *context); bool previousHandlerRestored = false; static std::function pageFaultHandler; - struct sigaction previousHandler = {}; + + struct sigaction previousPageFaultHandler = {}; + struct sigaction previousUserSignalHandler = {}; }; } // namespace NEO diff --git a/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.cpp b/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.cpp index ce97ccdf83..9da6200018 100644 --- a/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.cpp +++ b/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.cpp @@ -50,4 +50,7 @@ void PageFaultManagerWindows::protectCPUMemoryAccess(void *ptr, 
size_t size) { auto retVal = VirtualProtect(ptr, size, PAGE_NOACCESS, &previousState); UNRECOVERABLE_IF(!retVal); } + +void PageFaultManagerWindows::broadcastWaitSignal() {} + } // namespace NEO diff --git a/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.h b/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.h index 3184ad26e1..76b2bee4a6 100644 --- a/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.h +++ b/shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.h @@ -25,6 +25,8 @@ class PageFaultManagerWindows : public PageFaultManager { void allowCPUMemoryAccess(void *ptr, size_t size) override; void protectCPUMemoryAccess(void *ptr, size_t size) override; + void broadcastWaitSignal() override; + static std::function pageFaultHandler; PVOID previousHandler; }; diff --git a/shared/test/unit_test/page_fault_manager/linux/cpu_page_fault_manager_linux_tests.cpp b/shared/test/unit_test/page_fault_manager/linux/cpu_page_fault_manager_linux_tests.cpp index 974fc8fa30..3f647f441c 100644 --- a/shared/test/unit_test/page_fault_manager/linux/cpu_page_fault_manager_linux_tests.cpp +++ b/shared/test/unit_test/page_fault_manager/linux/cpu_page_fault_manager_linux_tests.cpp @@ -13,12 +13,59 @@ #include #include +#include +#include +#include using namespace NEO; using PageFaultManagerLinuxTest = PageFaultManagerConfigFixture; using MockPageFaultManagerLinux = MockPageFaultManagerHandlerInvoke; +struct UserSignalMockPageFaultManagerLinux : public PageFaultManagerLinux { + using PageFaultManager::verifyPageFault; + + UserSignalMockPageFaultManagerLinux() { + ownThread = std::thread([&]() { + while (!waitForCopyCalled) { + if (ownThreadId == -1) { + ownThreadId = static_cast(syscall(__NR_gettid)); + } + } + }); + while (ownThreadId == -1) { + } + } + + void allowCPUMemoryAccess(void *ptr, size_t size) override {} + void transferToCpu(void *ptr, size_t size, void *cmdQ) override {} + void 
setAubWritable(bool writable, void *ptr, SVMAllocsManager *unifiedMemoryManager) override {} + + void sendSignalToThread(int threadId) override { + PageFaultManagerLinux::sendSignalToThread(ownThreadId); + } + + void waitForCopy() override { + PageFaultManagerLinux::waitForCopy(); + waitForCopyCalled = true; + } + + std::thread ownThread; + int ownThreadId = -1; + bool waitForCopyCalled = false; +}; + +TEST_F(PageFaultManagerLinuxTest, whenVeryfyingPageFaultThenUserSignalIsSentToOtherThreads) { + auto pageFaultManager = std::make_unique(); + + auto alloc = reinterpret_cast(0x1); + pageFaultManager->insertAllocation(alloc, 10, nullptr, nullptr); + pageFaultManager->verifyPageFault(alloc); + pageFaultManager->ownThread.join(); + + EXPECT_TRUE(pageFaultManager->waitForCopyCalled); +} + TEST_F(PageFaultManagerLinuxTest, whenPageFaultIsRaisedThenHandlerIsInvoked) { auto pageFaultManager = std::make_unique(); EXPECT_FALSE(pageFaultManager->handlerInvoked); diff --git a/shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h b/shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h index 3382d5e760..7e6e8a93a1 100644 --- a/shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h +++ b/shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h @@ -49,6 +49,7 @@ class MockPageFaultManager : public PageFaultManager { void baseGpuTransfer(void *ptr, void *cmdQ) { PageFaultManager::transferToGpu(ptr, cmdQ); } + void broadcastWaitSignal() override {} int allowMemoryAccessCalled = 0; int protectMemoryCalled = 0;