Commit 3d04da5

[NVPTX] Add support for Shared Cluster Memory address space [2/2] (#136768)
Adds support for the new Shared Cluster Memory address space (SHARED_CLUSTER, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details. Follow-up to #135444.

1. Update existing codegen/intrinsics in LLVM and MLIR that now use this address space
2. Auto-upgrade previous intrinsics that used SMEM (addrspace 3) but were really taking a shared cluster pointer to the new address space
1 parent 8c74dc1 commit 3d04da5
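The auto-upgrade keeps old bitcode loading cleanly: a call that passed an addrspace(3) destination to one of these intrinsics is rewritten to cast the pointer to addrspace(7) and call the new signature. A minimal before/after sketch in LLVM IR, condensed from the auto_upgrade_nvvm_intrinsics.ll test in this commit (value names are illustrative):

  ; before upgrade (old bitcode, addrspace(3) destination)
  call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)

  ; after upgrade: the destination is addrspacecast to the new shared cluster space
  %dst7 = addrspacecast ptr addrspace(3) %dst to ptr addrspace(7)
  call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst7, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)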

File tree

11 files changed: +349, -127 lines


Diff for: clang/test/CodeGenCUDA/builtins-sm90.cu

+1-1
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
   auto * sptr = (__attribute__((address_space(3))) void *)ptr;
   // CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
   out[i++] = (long) __nvvm_mapa(ptr, u);
-  // CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+  // CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
   out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
   // CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
   out[i++] = __nvvm_getctarank(ptr);

Diff for: llvm/docs/NVPTXUsage.rst

+30-3
@@ -108,6 +108,7 @@ The NVPTX back-end uses the following address space mapping:
    3             Shared
    4             Constant
    5             Local
+   7             Shared Cluster
    ============= ======================
 
 Every global variable and pointer type is assigned to one of these address
@@ -306,6 +307,32 @@ If the given pointer in the generic address space refers to memory which falls
 within the state space of the intrinsic (and therefore could be safely address
 space casted to this space), 1 is returned, otherwise 0 is returned.
 
+'``llvm.nvvm.mapa.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare ptr @llvm.nvvm.mapa(ptr %p, i32 %rank)
+  declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.mapa.*``' intrinsics map a shared memory pointer ``p`` of the
+current CTA to the corresponding address in the CTA whose rank in the cluster
+is ``%rank``. The ``llvm.nvvm.mapa`` form expects a generic pointer to shared
+memory and returns a generic pointer to shared cluster memory. The
+``llvm.nvvm.mapa.shared.cluster`` form expects a pointer to shared memory and
+returns a pointer to shared cluster memory. They correspond directly to the
+``mapa`` and ``mapa.shared.cluster`` PTX instructions.
+
+Semantics:
+""""""""""
+
+The returned pointer refers to the location in the shared memory of the CTA
+with rank ``%rank`` that corresponds to the given address ``p`` in the shared
+memory of the current CTA.
+
 Arithmetic Intrinsics
 ---------------------
 
@@ -552,7 +579,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
 
 Overview:
 """""""""
@@ -616,7 +643,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
+  declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
 
 Overview:
 """""""""
@@ -771,7 +798,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
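The new mapa documentation pairs naturally with a short usage example. A hedged sketch in LLVM IR of reading a peer CTA's shared memory through llvm.nvvm.mapa.shared.cluster (the function and value names are illustrative, not part of the commit):

  declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)

  define i32 @read_peer_cta(ptr addrspace(3) %p, i32 %rank) {
    ; map %p to the matching location in the CTA with the given cluster rank
    %remote = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
    ; load through the resulting shared cluster (addrspace 7) pointer
    %v = load i32, ptr addrspace(7) %remote
    ret i32 %v
  }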

Diff for: llvm/include/llvm/IR/IntrinsicsNVVM.td

+27-26
@@ -127,10 +127,11 @@
 // * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
 // * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
 
-def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
-def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
-def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
-def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
+def llvm_global_ptr_ty         : LLVMQualPointerType<1>; // (global)ptr
+def llvm_shared_ptr_ty         : LLVMQualPointerType<3>; // (shared)ptr
+def llvm_local_ptr_ty          : LLVMQualPointerType<5>; // (local)ptr
+def llvm_tmem_ptr_ty           : LLVMQualPointerType<6>; // (tensor memory)ptr
+def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
 
 //
 // MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
   list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
   list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
   list<LLVMType> ArgsTy = !listconcat(
-      [llvm_shared_ptr_ty,  // dst_smem_ptr
-       llvm_shared_ptr_ty,  // mbarrier_smem_ptr
-       llvm_ptr_ty],        // tensormap_ptr
-      TensorDimsTy,         // actual tensor dims
-      Im2ColOffsetsTy,      // im2col offsets
-      [llvm_i16_ty,         // cta_mask
-       llvm_i64_ty,         // cache_hint
-       llvm_i1_ty,          // Flag for cta_mask
-       llvm_i1_ty]          // Flag for cache_hint
+      [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
+       llvm_shared_ptr_ty,         // mbarrier_smem_ptr
+       llvm_ptr_ty],               // tensormap_ptr
+      TensorDimsTy,                // actual tensor dims
+      Im2ColOffsetsTy,             // im2col offsets
+      [llvm_i16_ty,                // cta_mask
+       llvm_i64_ty,                // cache_hint
+       llvm_i1_ty,                 // Flag for cta_mask
+       llvm_i1_ty]                 // Flag for cache_hint
   );
 
   int TempFlagsStartIdx = !add(dim, 5);
@@ -5134,7 +5135,7 @@ def int_nvvm_mapa
       [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
       "llvm.nvvm.mapa">;
 def int_nvvm_mapa_shared_cluster
-    : DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
+    : DefaultAttrsIntrinsic<[llvm_shared_cluster_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
       [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
       "llvm.nvvm.mapa.shared.cluster">;
 def int_nvvm_getctarank
@@ -5234,14 +5235,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
 // From Global to Shared Cluster
 def int_nvvm_cp_async_bulk_global_to_shared_cluster
     : DefaultAttrsIntrinsic<[],
-      [llvm_shared_ptr_ty, // dst_smem_ptr
-       llvm_shared_ptr_ty, // mbarrier_ptr
-       llvm_global_ptr_ty, // src_gmem_ptr
-       llvm_i32_ty,        // copy_size
-       llvm_i16_ty,        // cta_mask
-       llvm_i64_ty,        // cache_hint
-       llvm_i1_ty,         // Flag for cta_mask
-       llvm_i1_ty],        // Flag for cache_hint
+      [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
+       llvm_shared_ptr_ty,         // mbarrier_ptr
+       llvm_global_ptr_ty,         // src_gmem_ptr
+       llvm_i32_ty,                // copy_size
+       llvm_i16_ty,                // cta_mask
+       llvm_i64_ty,                // cache_hint
+       llvm_i1_ty,                 // Flag for cta_mask
+       llvm_i1_ty],                // Flag for cache_hint
       [IntrConvergent, IntrArgMemOnly,
       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5251,10 +5252,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
 // From Shared CTA to Shared Cluster
 def int_nvvm_cp_async_bulk_shared_cta_to_cluster
     : DefaultAttrsIntrinsic<[],
-      [llvm_shared_ptr_ty, // dst_smem_ptr
-       llvm_shared_ptr_ty, // mbarrier_ptr
-       llvm_shared_ptr_ty, // src_smem_ptr
-       llvm_i32_ty],       // copy_size
+      [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
+       llvm_shared_ptr_ty,         // mbarrier_ptr
+       llvm_shared_ptr_ty,         // src_smem_ptr
+       llvm_i32_ty],               // copy_size
       [IntrConvergent, IntrArgMemOnly,
       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
      NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
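To make the TableGen argument list above concrete, this is what a call to the 1-D tile variant looks like in IR after this change; a hedged sketch with illustrative value names, passing zeroed i16/i64 operands and both i1 flags false so the cta_mask and cache_hint are ignored:

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7), ptr addrspace(3), ptr, i32, i16, i64, i1, i1)

  define void @tile_1d(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr %tmap, i32 %d0) {
    ; the destination is now a shared cluster pointer; the mbarrier stays in CTA shared memory
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false)
    ret void
  }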

Diff for: llvm/lib/IR/AutoUpgrade.cpp

+83
@@ -46,6 +46,7 @@
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstdint>
@@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
   return false; // No other 'arm.*', 'aarch64.*'.
 }
 
+static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
+                                                              StringRef Name) {
+  if (Name.consume_front("mapa.shared.cluster"))
+    if (F->getReturnType()->getPointerAddressSpace() ==
+        NVPTXAS::ADDRESS_SPACE_SHARED)
+      return Intrinsic::nvvm_mapa_shared_cluster;
+
+  if (Name.consume_front("cp.async.bulk.")) {
+    Intrinsic::ID ID =
+        StringSwitch<Intrinsic::ID>(Name)
+            .Case("global.to.shared.cluster",
+                  Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
+            .Case("shared.cta.to.cluster",
+                  Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
+            .Case("tensor.g2s.im2col.3d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
+            .Case("tensor.g2s.im2col.4d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
+            .Case("tensor.g2s.im2col.5d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
+            .Case("tensor.g2s.tile.1d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
+            .Case("tensor.g2s.tile.2d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
+            .Case("tensor.g2s.tile.3d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
+            .Case("tensor.g2s.tile.4d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
+            .Case("tensor.g2s.tile.5d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
+            .Default(Intrinsic::not_intrinsic);
+
+    if (ID != Intrinsic::not_intrinsic)
+      if (F->getArg(0)->getType()->getPointerAddressSpace() ==
+          NVPTXAS::ADDRESS_SPACE_SHARED)
+        return ID;
+  }
+
+  return Intrinsic::not_intrinsic;
+}
+
 static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
   if (Name.consume_front("fma.rn."))
     return StringSwitch<Intrinsic::ID>(Name)
@@ -1278,6 +1320,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       }
     }
 
+    // Upgrade Distributed Shared Memory Intrinsics
+    Intrinsic::ID IID = shouldUpgradeNVPTXSharedClusterIntrinsic(F, Name);
+    if (IID != Intrinsic::not_intrinsic) {
+      rename(F);
+      NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+      return true;
+    }
+
     // The following nvvm intrinsics correspond exactly to an LLVM idiom, but
     // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
     //
@@ -4718,6 +4768,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     CI->eraseFromParent();
     return;
   }
+  case Intrinsic::nvvm_mapa_shared_cluster: {
+    // Create a new call with the correct address space.
+    NewCall =
+        Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
+    Value *Res = NewCall;
+    Res = Builder.CreateAddrSpaceCast(
+        Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
+    NewCall->takeName(CI);
+    CI->replaceAllUsesWith(Res);
+    CI->eraseFromParent();
+    return;
+  }
+  case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
+  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
+    // Create a new call with the correct address space.
+    SmallVector<Value *, 4> Args(CI->args());
+    Args[0] = Builder.CreateAddrSpaceCast(
+        Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    NewCall->takeName(CI);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+  }
   case Intrinsic::riscv_sha256sig0:
   case Intrinsic::riscv_sha256sig1:
   case Intrinsic::riscv_sha256sum0:
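Note the asymmetry between the two upgrade paths above: the bulk-copy intrinsics take the shared cluster pointer as an argument, so the old addrspace(3) value is cast up to addrspace(7) before the new call, while mapa.shared.cluster returns it, so the new call's addrspace(7) result is cast back down to addrspace(3) to keep existing users type-correct. A sketch of the mapa case in IR (value names illustrative):

  ; old call being upgraded:
  ;   %r = call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
  ; emitted replacement:
  %r = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
  %r.cast = addrspacecast ptr addrspace(7) %r to ptr addrspace(3) ; feeds all former users of %r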

Diff for: llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll

+57
@@ -59,6 +59,21 @@ declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)
 declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)
 declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32)
 
+declare ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)
+
+declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
+declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
+
 ; CHECK-LABEL: @simple_upgrade
 define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
   ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -254,3 +269,45 @@ define i32 @atomics(ptr %p0, i32 %a) {
   ret i32 %r2
 }
 
+; CHECK-LABEL: @nvvm_shared_cluster_intrinsics
+define void @nvvm_shared_cluster_intrinsics(ptr addrspace(3) %p0, i32 %offset) {
+  ; CHECK: %r = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p0, i32 %offset)
+  %r = call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p0, i32 %offset)
+  ret void
+}
+
+; CHECK-LABEL: @nvvm_cp_async_bulk_intrinsics
+define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, ptr addrspace(3) %src_shared, i32 %size) {
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
+  call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
+  call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
+  ret void
+}
+
+; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
+define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false)
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false)
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0)
+  ret void
+}
+
+; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
+define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false)
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false)
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false)
  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false)
+  ; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 0, i1 0)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 0, i1 0)
+  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 0, i1 0)
+  ret void
+}
