[libc] Optimize the RPC memory copy for the AMDGPU target (#70467)

Summary: We previously made the change to make the GPU target use builtin implementations of memory copy functions. However, this had the negative effect of massively increasing register usages when using the printing interface. For example, a `printf` call went from using 25 VGPRs to 54 simply because of using the builtin. However, we probably want to still export the builitin, but for the RPC interface we heavily prefer small resource usage over the performance gains of fully unrolling this loop. For NVPTX however, the builtin implementation causes the resource usage to go down (36 registers total for a regular `fputs` call) so we will maintain that implementation. I think specializing this is the right call as we will always prefer the implementation with the smallest resource footprint for this interface, as performance is already going to be heavily bottlenecked by the use of fine-grained memory.
2026-01-20 19:07:53 +08:00 · 2023-10-27 14:55:37 -05:00
parent 741579930e
commit 8e447a123b
2 changed files with 18 additions and 5 deletions
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -24,7 +24,6 @@
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/GPU/utils.h"
-#include "src/string/memory_utils/inline_memcpy.h"

 #include <stdint.h>

@@ -458,7 +457,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
        lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
            ? sizeof(Buffer::data) - sizeof(uint64_t)
            : lane_value(size, id);
-    inline_memcpy(&buffer->data[1], lane_value(src, id), len);
+    rpc_memcpy(&buffer->data[1], lane_value(src, id), len);
  });
  uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
  uint64_t mask = process.packet[index].header.mask;
@@ -468,7 +467,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
                         ? sizeof(Buffer::data)
                         : lane_value(size, id) - idx;
      if (idx < lane_value(size, id))
-        inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
+        rpc_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
    });
    idx += sizeof(Buffer::data);
  }
@@ -491,7 +490,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
        lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
            ? sizeof(Buffer::data) - sizeof(uint64_t)
            : lane_value(size, id);
-    inline_memcpy(lane_value(dst, id), &buffer->data[1], len);
+    rpc_memcpy(lane_value(dst, id), &buffer->data[1], len);
  });
  uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
  uint64_t mask = process.packet[index].header.mask;
@@ -501,7 +500,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
                         ? sizeof(Buffer::data)
                         : lane_value(size, id) - idx;
      if (idx < lane_value(size, id))
-        inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
+        rpc_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
    });
    idx += sizeof(Buffer::data);
  }
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -13,6 +13,8 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/properties/architectures.h"
+#include "src/string/memory_utils/generic/byte_per_byte.h"
+#include "src/string/memory_utils/inline_memcpy.h"

 namespace LIBC_NAMESPACE {
 namespace rpc {
@@ -64,6 +66,18 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
    return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
 }

+/// Wrapper around the optimal memory copy implementation for the target.
+LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
+  // The built-in memcpy prefers to fully unroll loops. We want to minimize
+  // resource usage so we use a single nounroll loop implementation.
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+  inline_memcpy_byte_per_byte(reinterpret_cast<Ptr>(dst),
+                              reinterpret_cast<CPtr>(src), count);
+#else
+  inline_memcpy(dst, src, count);
+#endif
+}
+
 } // namespace rpc
 } // namespace LIBC_NAMESPACE