[flang][cuda] Defined allocator for unified data (#102189)

CUDA unified variable where set to use the same allocator than managed variable. This patch adds a specific allocator for the unified variables. Currently it will call the managed allocator underneath but we want to have the flexibility to change that in the future.
2026-01-28 01:04:49 +08:00 · 2024-08-06 14:30:31 -07:00
parent 10d7805c4f
commit 388b63243c
4 changed files with 20 additions and 3 deletions
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -36,5 +36,8 @@ void CUFFreeDevice(void *);
 void *CUFAllocManaged(std::size_t);
 void CUFFreeManaged(void *);

+void *CUFAllocUnified(std::size_t);
+void CUFFreeUnified(void *);
+
 } // namespace Fortran::runtime::cuda
 #endif // FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
--- a/flang/include/flang/Runtime/allocator-registry.h
+++ b/flang/include/flang/Runtime/allocator-registry.h
@@ -19,8 +19,9 @@ static constexpr unsigned kDefaultAllocator = 0;
 static constexpr unsigned kPinnedAllocatorPos = 1;
 static constexpr unsigned kDeviceAllocatorPos = 2;
 static constexpr unsigned kManagedAllocatorPos = 3;
+static constexpr unsigned kUnifiedAllocatorPos = 4;

-#define MAX_ALLOCATOR 5
+#define MAX_ALLOCATOR 7 // 3 bits are reserved in the descriptor.

 namespace Fortran::runtime {

--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -1860,9 +1860,10 @@ static unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
      return kPinnedAllocatorPos;
    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
      return kDeviceAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed ||
-        *cudaAttr == Fortran::common::CUDADataAttr::Unified)
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
      return kManagedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
+      return kUnifiedAllocatorPos;
  }
  return kDefaultAllocator;
 }
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -26,6 +26,8 @@ void CUFRegisterAllocator() {
      kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
  allocatorRegistry.Register(
      kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
+  allocatorRegistry.Register(
+      kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
 }

 void *CUFAllocPinned(std::size_t sizeInBytes) {
@@ -57,4 +59,14 @@ void CUFFreeManaged(void *p) {
  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
 }

+void *CUFAllocUnified(std::size_t sizeInBytes) {
+  // Call alloc managed for the time being.
+  return CUFAllocManaged(sizeInBytes);
+}
+
+void CUFFreeUnified(void *p) {
+  // Call free managed for the time being.
+  CUFFreeManaged(p);
+}
+
 } // namespace Fortran::runtime::cuda