[CUDA][HIP] Allow comdat for kernels

Two identical instantiations of a template function can be emitted by two TU's
with linkonce_odr linkage without causing duplicate symbols in linker. MSVC
also requires these symbols be in comdat sections. Linux does not require
the symbols in comdat sections to be merged by linker but by default
clang puts them in comdat sections.

If a template kernel is instantiated identically in two TU's. MSVC requires
that them to be in comdat sections, otherwise MSVC linker will diagnose them as
duplicate symbols. However, currently clang does not put instantiated template
kernels in comdat sections, which causes link error for MSVC.

This patch allows putting instantiated template kernels into comdat sections.

Reviewed by: Artem Belevich, Reid Kleckner

Differential Revision: https://reviews.llvm.org/D112492
This commit is contained in:
Yaxun (Sam) Liu
2021-11-03 22:49:24 -04:00
parent 861adaf2ad
commit 80072fde61
4 changed files with 61 additions and 42 deletions

View File

@@ -1147,6 +1147,7 @@ llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
Var->setAlignment(CGM.getPointerAlign().getAsAlign());
Var->setDSOLocal(F->isDSOLocal());
Var->setVisibility(F->getVisibility());
CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
KernelHandles[F] = Var;
KernelStubs[Var] = F;
return Var;

View File

@@ -4308,11 +4308,6 @@ static bool shouldBeInCOMDAT(CodeGenModule &CGM, const Decl &D) {
if (!CGM.supportsCOMDAT())
return false;
// Do not set COMDAT attribute for CUDA/HIP stub functions to prevent
// them being "merged" by the COMDAT Folding linker optimization.
if (D.hasAttr<CUDAGlobalAttr>())
return false;
if (D.hasAttr<SelectAnyAttr>())
return true;

View File

@@ -2,16 +2,35 @@
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -x hip\
// RUN: | FileCheck %s
// RUN: | FileCheck -check-prefixes=CHECK,GNU %s
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -x hip\
// RUN: | FileCheck -check-prefix=NEG %s
// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
// RUN: -aux-triple amdgcn-amd-amdhsa -fcuda-include-gpubinary \
// RUN: %t -o - -x hip\
// RUN: | FileCheck -check-prefixes=CHECK,MSVC %s
// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
// RUN: -aux-triple amdgcn-amd-amdhsa -fcuda-include-gpubinary \
// RUN: %t -o - -x hip\
// RUN: | FileCheck -check-prefix=NEG %s
#include "Inputs/cuda.h"
// Kernel handles
// Check kernel handles are emitted for non-MSVC target but not for MSVC target.
// CHECK: @[[HCKERN:ckernel]] = constant void ()* @__device_stub__ckernel, align 8
// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @_ZN2ns23__device_stub__nskernelEv, align 8
// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @_Z25__device_stub__kernelfuncIiEvv, align 8
// CHECK: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
// GNU: @[[HCKERN:ckernel]] = constant void ()* @[[CSTUB:__device_stub__ckernel]], align 8
// GNU: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]], align 8
// GNU: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]], comdat, align 8
// GNU: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
// MSVC: @[[HCKERN:ckernel]] = dso_local constant void ()* @[[CSTUB:__device_stub__ckernel]], align 8
// MSVC: @[[HNSKERN:"\?nskernel@ns@@YAXXZ.*"]] = dso_local constant void ()* @[[NSSTUB:"\?nskernel@ns@@YAXXZ"]], align 8
// MSVC: @[[HTKERN:"\?\?\$kernelfunc@H@@YAXXZ.*"]] = linkonce_odr dso_local constant void ()* @[[TSTUB:"\?\?\$kernelfunc@H@@YAXXZ.*"]], comdat, align 8
// MSVC: @[[HDKERN:"\?kernel_decl@@YAXXZ.*"]] = external dso_local constant void ()*, align 8
extern "C" __global__ void ckernel() {}
@@ -24,10 +43,10 @@ __global__ void kernelfunc() {}
__global__ void kernel_decl();
void (*kernel_ptr)();
void *void_ptr;
extern "C" void (*kernel_ptr)();
extern "C" void *void_ptr;
void launch(void *kern);
extern "C" void launch(void *kern);
// Device side kernel names
@@ -37,21 +56,22 @@ void launch(void *kern);
// Non-template kernel stub functions
// CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
// CHECK: define{{.*}}@[[CSTUB]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
// CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]]
// CHECK: define{{.*}}@[[NSSTUB]]
// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
// Check kernel stub is called for triple chevron.
// Check kernel stub is used for triple chevron
// CHECK-LABEL: define{{.*}}@_Z4fun1v()
// CHECK-LABEL: define{{.*}}@fun1()
// CHECK: call void @[[CSTUB]]()
// CHECK: call void @[[NSSTUB]]()
// CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]()
// CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
// CHECK: call void @[[TSTUB]]()
// GNU: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
// MSVC: call void @[[DSTUB:"\?kernel_decl@@YAXXZ"]]()
void fun1(void) {
extern "C" void fun1(void) {
ckernel<<<1, 1>>>();
ns::nskernel<<<1, 1>>>();
kernelfunc<int><<<1, 1>>>();
@@ -67,28 +87,28 @@ void fun1(void) {
// CHECK: declare{{.*}}@[[DSTUB]]
// Check kernel handle is used for passing the kernel as a function pointer
// Check kernel handle is used for passing the kernel as a function pointer.
// CHECK-LABEL: define{{.*}}@_Z4fun2v()
// CHECK: call void @_Z6launchPv({{.*}}[[HCKERN]]
// CHECK: call void @_Z6launchPv({{.*}}[[HNSKERN]]
// CHECK: call void @_Z6launchPv({{.*}}[[HTKERN]]
// CHECK: call void @_Z6launchPv({{.*}}[[HDKERN]]
void fun2() {
// CHECK-LABEL: define{{.*}}@fun2()
// CHECK: call void @launch({{.*}}[[HCKERN]]
// CHECK: call void @launch({{.*}}[[HNSKERN]]
// CHECK: call void @launch({{.*}}[[HTKERN]]
// CHECK: call void @launch({{.*}}[[HDKERN]]
extern "C" void fun2() {
launch((void *)ckernel);
launch((void *)ns::nskernel);
launch((void *)kernelfunc<int>);
launch((void *)kernel_decl);
}
// Check kernel handle is used for assigning a kernel to a function pointer
// Check kernel handle is used for assigning a kernel to a function pointer.
// CHECK-LABEL: define{{.*}}@_Z4fun3v()
// CHECK-LABEL: define{{.*}}@fun3()
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
// CHECK: store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
// CHECK: store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
void fun3() {
extern "C" void fun3() {
kernel_ptr = ckernel;
kernel_ptr = &ckernel;
void_ptr = (void *)ckernel;
@@ -96,34 +116,37 @@ void fun3() {
}
// Check kernel stub is loaded from kernel handle when function pointer is
// used with triple chevron
// used with triple chevron.
// CHECK-LABEL: define{{.*}}@_Z4fun4v()
// CHECK-LABEL: define{{.*}}@fun4()
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
// CHECK: call i32 @_Z16hipConfigureCall4dim3S_mP9hipStream
// CHECK: call i32 @{{.*hipConfigureCall}}
// CHECK: %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
// CHECK: %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to void ()**
// CHECK: %[[STUB:.*]] = load void ()*, void ()** %[[CAST]], align 8
// CHECK: call void %[[STUB]]()
void fun4() {
extern "C" void fun4() {
kernel_ptr = ckernel;
kernel_ptr<<<1,1>>>();
}
// Check kernel handle is passed to a function
// Check kernel handle is passed to a function.
// CHECK-LABEL: define{{.*}}@_Z4fun5v()
// CHECK-LABEL: define{{.*}}@fun5()
// CHECK: store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
// CHECK: %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
// CHECK: %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to i8*
// CHECK: call void @_Z6launchPv(i8* %[[CAST]])
void fun5() {
// CHECK: call void @launch(i8* %[[CAST]])
extern "C" void fun5() {
kernel_ptr = ckernel;
launch((void *)kernel_ptr);
}
// Check kernel handle is registered.
// CHECK-LABEL: define{{.*}}@__hip_register_globals
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
// CHECK-NOT: call{{.*}}@__hipRegisterFunction{{.*}}@[[HDKERN]]{{.*}}@{{[0-9]*}}
// NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}__device_stub
// NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}kernel_decl

View File

@@ -109,7 +109,7 @@ __host__ __device__ void tests_hd(void *t) {
}
// Make sure that we've generated the kernel used by A::~A.
// DEVICE-LABEL: define dso_local void @_Z1fIiEvT_
// DEVICE-LABEL: define void @_Z1fIiEvT_
// Make sure we've picked deallocator for the correct side of compilation.