20 using namespace clang;
21 using namespace CodeGen;
27 OMPRTL_NVPTX__kmpc_kernel_init,
29 OMPRTL_NVPTX__kmpc_kernel_deinit,
32 OMPRTL_NVPTX__kmpc_spmd_kernel_init,
34 OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
38 OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41 OMPRTL_NVPTX__kmpc_kernel_parallel,
43 OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46 OMPRTL_NVPTX__kmpc_serialized_parallel,
49 OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52 OMPRTL_NVPTX__kmpc_shuffle_int32,
55 OMPRTL_NVPTX__kmpc_shuffle_int64,
61 OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
71 OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
73 OMPRTL_NVPTX__kmpc_end_reduce_nowait
83 llvm::BasicBlock *ContBlock =
nullptr;
88 bool Conditional =
false)
89 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
98 CGF.
Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
116 class ExecutionModeRAII {
128 ~ExecutionModeRAII() { Mode = SavedMode; }
140 LaneIDMask = WarpSize - 1,
143 GlobalMemoryAlignment = 256,
157 llvm::Intrinsic::getDeclaration(
158 &CGF.
CGM.
getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
165 llvm::Intrinsic::getDeclaration(
166 &CGF.
CGM.
getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
190 llvm::Intrinsic::getDeclaration(
191 &CGF.
CGM.
getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
192 "nvptx_num_threads");
206 llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
226 bool IsInSpmdExecutionMode =
false) {
228 return IsInSpmdExecutionMode
248 return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
249 Bld.CreateNot(Mask),
"master_tid");
252 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
254 : WorkerFn(
nullptr), CGFI(
nullptr) {
255 createWorkerFunction(CGM);
258 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
269 bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode()
const {
270 return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
277 switch (DirectiveKind) {
279 case OMPD_target_teams:
280 return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
281 case OMPD_target_parallel:
282 case OMPD_target_parallel_for:
283 case OMPD_target_parallel_for_simd:
284 return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
286 llvm_unreachable(
"Unsupported directive on NVPTX device.");
288 llvm_unreachable(
"Unsupported directive on NVPTX device.");
292 StringRef ParentName,
293 llvm::Function *&OutlinedFn,
294 llvm::Constant *&OutlinedFnID,
297 ExecutionModeRAII ModeRAII(CurrentExecutionMode,
298 CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
299 EntryFunctionState EST;
300 WorkerFunctionState WST(CGM);
302 WrapperFunctionsMap.clear();
307 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
308 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
312 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
313 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
314 : RT(RT), EST(EST), WST(WST) {}
316 RT.emitGenericEntryHeader(CGF, EST, WST);
319 RT.emitGenericEntryFooter(CGF, EST);
321 } Action(*
this, EST, WST);
323 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
324 IsOffloadEntry, CodeGen);
327 emitWorkerFunction(WST);
331 WST.WorkerFn->setName(OutlinedFn->getName() +
"_worker");
335 void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(
CodeGenFunction &CGF,
336 EntryFunctionState &EST,
337 WorkerFunctionState &WST) {
347 Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
350 emitCall(CGF, WST.WorkerFn);
356 Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
365 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
368 void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(
CodeGenFunction &CGF,
369 EntryFunctionState &EST) {
373 llvm::BasicBlock *TerminateBB = CGF.
createBasicBlock(
".termination.notifier");
381 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
388 EST.ExitBB =
nullptr;
392 StringRef ParentName,
393 llvm::Function *&OutlinedFn,
394 llvm::Constant *&OutlinedFnID,
397 ExecutionModeRAII ModeRAII(CurrentExecutionMode,
398 CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
399 EntryFunctionState EST;
404 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
409 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
411 : RT(RT), EST(EST), D(D) {}
413 RT.emitSpmdEntryHeader(CGF, EST, D);
416 RT.emitSpmdEntryFooter(CGF, EST);
418 } Action(*
this, EST, D);
420 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
421 IsOffloadEntry, CodeGen);
424 void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
440 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
447 EntryFunctionState &EST) {
457 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
461 EST.ExitBB =
nullptr;
472 (void)
new llvm::GlobalVariable(
474 llvm::GlobalValue::WeakAnyLinkage,
475 llvm::ConstantInt::get(CGM.
Int8Ty, Mode), Name + Twine(
"_exec_mode"));
478 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
484 emitWorkerLoop(CGF, WST);
489 WorkerFunctionState &WST) {
502 llvm::BasicBlock *SelectWorkersBB = CGF.
createBasicBlock(
".select.workers");
504 llvm::BasicBlock *TerminateBB = CGF.
createBasicBlock(
".terminate.parallel");
529 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
534 Bld.CreateIsNull(Bld.
CreateLoad(WorkFn),
"should_terminate");
535 Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
540 Bld.CreateIsNotNull(Bld.
CreateLoad(ExecStatus),
"is_active");
541 Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
550 for (
auto *W : Work) {
555 Bld.CreateICmpEQ(Bld.
CreateLoad(WorkFn),
ID,
"work_match");
559 Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
574 emitCall(CGF, W, {Bld.getInt16(0),
586 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
606 llvm::Constant *RTLFn =
nullptr;
607 switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
608 case OMPRTL_NVPTX__kmpc_kernel_init: {
612 llvm::FunctionType *FnTy =
613 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
617 case OMPRTL_NVPTX__kmpc_kernel_deinit: {
620 llvm::FunctionType *FnTy =
621 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
625 case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
629 llvm::FunctionType *FnTy =
630 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
634 case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
636 llvm::FunctionType *FnTy =
641 case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
648 llvm::FunctionType *FnTy =
649 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
653 case OMPRTL_NVPTX__kmpc_kernel_parallel: {
659 llvm::FunctionType *FnTy =
660 llvm::FunctionType::get(RetTy, TypeParams,
false);
664 case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
666 llvm::FunctionType *FnTy =
671 case OMPRTL_NVPTX__kmpc_serialized_parallel: {
675 llvm::FunctionType *FnTy =
676 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
680 case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
684 llvm::FunctionType *FnTy =
685 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
689 case OMPRTL_NVPTX__kmpc_shuffle_int32: {
693 llvm::FunctionType *FnTy =
694 llvm::FunctionType::get(CGM.
Int32Ty, TypeParams,
false);
698 case OMPRTL_NVPTX__kmpc_shuffle_int64: {
702 llvm::FunctionType *FnTy =
703 llvm::FunctionType::get(CGM.
Int64Ty, TypeParams,
false);
707 case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
715 auto *ShuffleReduceFnTy =
716 llvm::FunctionType::get(CGM.
VoidTy, ShuffleReduceTypeParams,
719 auto *InterWarpCopyFnTy =
720 llvm::FunctionType::get(CGM.
VoidTy, InterWarpCopyTypeParams,
726 ShuffleReduceFnTy->getPointerTo(),
727 InterWarpCopyFnTy->getPointerTo()};
728 llvm::FunctionType *FnTy =
729 llvm::FunctionType::get(CGM.
Int32Ty, TypeParams,
false);
731 FnTy,
"__kmpc_nvptx_parallel_reduce_nowait");
734 case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
746 auto *ShuffleReduceFnTy =
747 llvm::FunctionType::get(CGM.
VoidTy, ShuffleReduceTypeParams,
750 auto *InterWarpCopyFnTy =
751 llvm::FunctionType::get(CGM.
VoidTy, InterWarpCopyTypeParams,
755 auto *CopyToScratchpadFnTy =
756 llvm::FunctionType::get(CGM.
VoidTy, CopyToScratchpadTypeParams,
760 auto *LoadReduceFnTy =
761 llvm::FunctionType::get(CGM.
VoidTy, LoadReduceTypeParams,
767 ShuffleReduceFnTy->getPointerTo(),
768 InterWarpCopyFnTy->getPointerTo(),
769 CopyToScratchpadFnTy->getPointerTo(),
770 LoadReduceFnTy->getPointerTo()};
771 llvm::FunctionType *FnTy =
772 llvm::FunctionType::get(CGM.
Int32Ty, TypeParams,
false);
774 FnTy,
"__kmpc_nvptx_teams_reduce_nowait");
777 case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
780 llvm::FunctionType *FnTy =
781 llvm::FunctionType::get(CGM.
VoidTy, TypeParams,
false);
783 FnTy,
"__kmpc_nvptx_end_reduce_nowait");
790 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *
ID,
791 llvm::Constant *Addr,
792 uint64_t Size, int32_t) {
793 auto *F = dyn_cast<llvm::Function>(Addr);
798 llvm::Module *M = F->getParent();
799 llvm::LLVMContext &Ctx = M->getContext();
802 llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(
"nvvm.annotations");
804 llvm::Metadata *MDVals[] = {
805 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx,
"kernel"),
806 llvm::ConstantAsMetadata::get(
807 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
809 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
812 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
814 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
819 assert(!ParentName.empty() &&
"Invalid target region parent name!");
824 case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
825 emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
828 case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
829 emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
834 "Unknown programming model for OpenMP directive on NVPTX target.");
843 llvm_unreachable(
"OpenMP NVPTX can only handle device code.");
851 if (isInSpmdExecutionMode())
862 if (isInSpmdExecutionMode())
869 const Expr *NumTeams,
870 const Expr *ThreadLimit,
877 auto *OutlinedFun = cast<llvm::Function>(
879 D, ThreadIDVar, InnermostKind, CodeGen));
880 if (!isInSpmdExecutionMode()) {
881 llvm::Function *WrapperFun =
882 createDataSharingWrapper(OutlinedFun, D);
883 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
894 D, ThreadIDVar, InnermostKind, CodeGen);
895 llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
896 OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
897 OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
898 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
916 OutlinedFnArgs.push_back(ZeroAddr.
getPointer());
917 OutlinedFnArgs.push_back(ZeroAddr.
getPointer());
918 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
928 if (isInSpmdExecutionMode())
929 emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
931 emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
934 void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
937 llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
938 llvm::Function *WFn = WrapperFunctionsMap[Fn];
939 assert(WFn &&
"Wrapper function does not exist!");
944 auto &&L0ParallelGen = [
this, WFn, &CapturedVars](
CodeGenFunction &CGF,
950 if (!CapturedVars.empty()) {
952 CGF.CurFn->addFnAttr(
"has-nvptx-shared-depot");
955 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
960 Bld.getInt32(CapturedVars.size()),
971 CGF.EmitLoadOfPointer(SharedArgs,
974 Idx, CGF.getPointerSize());
976 CGF.EmitStoreOfScalar(PtrV, Dst,
false,
983 ID, llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)),
984 Bld.getInt32(0), Bld.getInt16(1)};
1003 Work.emplace_back(WFn);
1010 auto &&SeqGen = [
this, Fn, &CapturedVars, &Args, Loc](
CodeGenFunction &CGF,
1017 OutlinedFnArgs.push_back(
1018 llvm::ConstantPointerNull::get(CGM.
Int32Ty->getPointerTo()));
1019 OutlinedFnArgs.push_back(
1020 llvm::ConstantPointerNull::get(CGM.
Int32Ty->getPointerTo()));
1021 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1026 NVPTXActionTy Action(
1044 void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
1053 OutlinedFnArgs.push_back(
1054 llvm::ConstantPointerNull::get(CGM.
Int32Ty->getPointerTo()));
1055 OutlinedFnArgs.push_back(
1056 llvm::ConstantPointerNull::get(CGM.
Int32Ty->getPointerTo()));
1057 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1067 auto &CGM = CGF.
CGM;
1074 assert(Size <= 8 &&
"Unsupported bitwidth in shuffle instruction.");
1077 ? OMPRTL_NVPTX__kmpc_shuffle_int32
1078 : OMPRTL_NVPTX__kmpc_shuffle_int64;
1082 auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
1089 {ElemCast,
Offset, WarpSize});
1122 auto &CGM = CGF.
CGM;
1126 auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1127 auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1128 auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1133 unsigned Size = Privates.size();
1134 for (
auto &Private : Privates) {
1139 bool ShuffleInElement =
false;
1142 bool UpdateDestListPtr =
false;
1145 bool IncrScratchpadSrc =
false;
1146 bool IncrScratchpadDest =
false;
1149 case RemoteLaneToThread: {
1156 Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
1160 DestElementPtrAddr =
1163 CGF.
CreateMemTemp(Private->getType(),
".omp.reduction.element");
1164 ShuffleInElement =
true;
1165 UpdateDestListPtr =
true;
1175 Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
1179 DestElementPtrAddr =
1185 Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
1186 DestElementAddr = Bld.CreateElementBitCast(
1190 case ThreadToScratchpad: {
1197 Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
1201 unsigned ElementSizeInChars =
1202 C.getTypeSizeInChars(Private->getType()).getQuantity();
1203 auto *CurrentOffset =
1204 Bld.CreateMul(llvm::ConstantInt::get(CGM.
SizeTy, ElementSizeInChars),
1206 auto *ScratchPadElemAbsolutePtrVal =
1207 Bld.CreateAdd(DestBase.
getPointer(), CurrentOffset);
1208 ScratchPadElemAbsolutePtrVal =
1209 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.
VoidPtrTy);
1211 Address(ScratchPadElemAbsolutePtrVal,
1212 C.getTypeAlignInChars(Private->getType()));
1213 DestElementAddr = Bld.CreateElementBitCast(
1215 IncrScratchpadDest =
true;
1218 case ScratchpadToThread: {
1221 unsigned ElementSizeInChars =
1222 C.getTypeSizeInChars(Private->getType()).getQuantity();
1223 auto *CurrentOffset =
1224 Bld.CreateMul(llvm::ConstantInt::get(CGM.
SizeTy, ElementSizeInChars),
1226 auto *ScratchPadElemAbsolutePtrVal =
1227 Bld.CreateAdd(SrcBase.
getPointer(), CurrentOffset);
1228 ScratchPadElemAbsolutePtrVal =
1229 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.
VoidPtrTy);
1230 SrcElementAddr =
Address(ScratchPadElemAbsolutePtrVal,
1231 C.getTypeAlignInChars(Private->getType()));
1232 IncrScratchpadSrc =
true;
1236 DestElementPtrAddr =
1239 CGF.
CreateMemTemp(Private->getType(),
".omp.reduction.element");
1240 UpdateDestListPtr =
true;
1247 SrcElementAddr = Bld.CreateElementBitCast(
1255 if (ShuffleInElement) {
1269 if (UpdateDestListPtr) {
1272 DestElementPtrAddr,
false,
1279 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1282 unsigned ElementSizeInChars =
1283 C.getTypeSizeInChars(Private->getType()).getQuantity();
1284 ScratchpadBasePtr = Bld.CreateAdd(
1286 Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
1287 CGM.
SizeTy, ElementSizeInChars)));
1290 ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
1291 llvm::ConstantInt::get(CGM.
SizeTy, 1));
1292 ScratchpadBasePtr = Bld.CreateSDiv(
1294 llvm::ConstantInt::get(CGM.
SizeTy, GlobalMemoryAlignment));
1295 ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
1296 llvm::ConstantInt::get(CGM.
SizeTy, 1));
1297 ScratchpadBasePtr = Bld.CreateMul(
1299 llvm::ConstantInt::get(CGM.
SizeTy, GlobalMemoryAlignment));
1301 if (IncrScratchpadDest)
1327 auto Int32Ty = C.getIntTypeForBitwidth(32,
true);
1345 Args.push_back(&ReduceListArg);
1346 Args.push_back(&ScratchPadArg);
1347 Args.push_back(&IndexArg);
1348 Args.push_back(&WidthArg);
1349 Args.push_back(&ShouldReduceArg);
1354 "_omp_reduction_load_and_reduce", &CGM.
getModule());
1367 Bld.CreatePointerBitCastOrAddrSpaceCast(
1395 Bld.CreatePtrToInt(ScratchPadBase, CGM.
SizeTy);
1401 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.remote_red_list");
1405 SrcDataAddr, RemoteReduceList,
1414 auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
1415 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1420 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1421 ReduceListAddr.getPointer(), CGF.
VoidPtrTy);
1422 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1423 RemoteReduceList.getPointer(), CGF.
VoidPtrTy);
1425 Bld.CreateBr(MergeBB);
1431 RemoteReduceList, ReduceListAddr);
1432 Bld.CreateBr(MergeBB);
1451 auto Int32Ty = C.getIntTypeForBitwidth(32,
true);
1466 Args.push_back(&ReduceListArg);
1467 Args.push_back(&ScratchPadArg);
1468 Args.push_back(&IndexArg);
1469 Args.push_back(&WidthArg);
1474 "_omp_reduction_copy_to_scratchpad", &CGM.
getModule());
1486 Bld.CreatePointerBitCastOrAddrSpaceCast(
1510 Bld.CreatePtrToInt(ScratchPadBase, CGM.
SizeTy);
1514 SrcDataAddr, DestDataAddr,
1548 C.getIntTypeForBitwidth(32,
true),
1551 Args.push_back(&ReduceListArg);
1552 Args.push_back(&NumWarpsArg);
1557 "_omp_reduction_inter_warp_copy_func", &CGM.
getModule());
1574 const char *TransferMediumName =
1575 "__openmp_nvptx_data_transfer_temporary_storage";
1576 llvm::GlobalVariable *TransferMedium =
1577 M.getGlobalVariable(TransferMediumName);
1578 if (!TransferMedium) {
1579 auto *Ty = llvm::ArrayType::get(CGM.
Int64Ty, WarpSize);
1581 TransferMedium =
new llvm::GlobalVariable(
1583 false, llvm::GlobalVariable::CommonLinkage,
1584 llvm::Constant::getNullValue(Ty), TransferMediumName,
1585 nullptr, llvm::GlobalVariable::NotThreadLocal,
1586 SharedAddressSpace);
1598 Bld.CreatePointerBitCastOrAddrSpaceCast(
1605 for (
auto &Private : Privates) {
1616 Bld.CreateICmpEQ(LaneID, Bld.getInt32(0),
"warp_master");
1617 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1622 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.
getPointerSize());
1627 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
1628 ElemPtr = Bld.CreateElementBitCast(
1636 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1637 TransferMedium, {llvm::Constant::getNullValue(CGM.
Int64Ty), WarpID});
1638 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
1641 MediumPtr = Bld.CreateElementBitCast(
1645 Bld.CreateStore(Elem, MediumPtr);
1647 Bld.CreateBr(MergeBB);
1650 Bld.CreateBr(MergeBB);
1658 auto *NumActiveThreads = Bld.CreateNSWMul(
1671 auto IsActiveThread =
1672 Bld.CreateICmpULT(ThreadID, NumWarpsVal,
"is_active_thread");
1673 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1678 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1679 TransferMedium, {llvm::Constant::getNullValue(CGM.
Int64Ty), ThreadID});
1680 Address SrcMediumPtr(SrcMediumPtrVal,
1681 C.getTypeAlignInChars(Private->getType()));
1683 SrcMediumPtr = Bld.CreateElementBitCast(
1690 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.
getPointerSize());
1694 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
1695 TargetElemPtr = Bld.CreateElementBitCast(
1701 Bld.CreateBr(W0MergeBB);
1704 Bld.CreateBr(W0MergeBB);
1800 Args.push_back(&ReduceListArg);
1801 Args.push_back(&LaneIDArg);
1802 Args.push_back(&RemoteLaneOffsetArg);
1803 Args.push_back(&AlgoVerArg);
1808 "_omp_reduction_shuffle_and_reduce_func", &CGM.
getModule());
1820 Bld.CreatePointerBitCastOrAddrSpaceCast(
1841 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.remote_reduce_list");
1847 LocalReduceList, RemoteReduceList,
1848 {RemoteLaneOffsetArgVal,
1873 auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
1875 auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1876 auto CondAlgo1 = Bld.CreateAnd(
1877 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
1879 auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
1880 auto CondAlgo2 = Bld.CreateAnd(
1882 Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
1884 CondAlgo2 = Bld.CreateAnd(
1885 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
1887 auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
1888 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
1893 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1897 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1898 LocalReduceList.getPointer(), CGF.
VoidPtrTy);
1899 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1902 Bld.CreateBr(MergeBB);
1905 Bld.CreateBr(MergeBB);
1911 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1912 auto CondCopy = Bld.CreateAnd(
1913 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
1918 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
1922 RemoteReduceList, LocalReduceList);
1923 Bld.CreateBr(CpyMergeBB);
1926 Bld.CreateBr(CpyMergeBB);
2186 assert((TeamsReduction || ParallelReduction) &&
2187 "Invalid reduction selection in emitReduction.");
2193 auto Size = RHSExprs.size();
2194 for (
auto *E : Privates) {
2195 if (E->getType()->isVariablyModifiedType())
2199 llvm::APInt ArraySize(32, Size);
2204 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
2205 auto IPriv = Privates.begin();
2207 for (
unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2214 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2232 LHSExprs, RHSExprs, ReductionOps);
2237 auto *ReductionArrayTySize = CGF.
getTypeSize(ReductionArrayTy);
2242 CGM, Privates, ReductionArrayTy, ReductionFn);
2243 auto *InterWarpCopyFn =
2247 if (ParallelReduction) {
2249 CGF.
Builder.getInt32(RHSExprs.size()),
2250 ReductionArrayTySize,
2260 if (TeamsReduction) {
2261 auto *ScratchPadCopyFn =
2264 CGM, Privates, ReductionArrayTy, ReductionFn);
2267 CGF.
Builder.getInt32(RHSExprs.size()),
2268 ReductionArrayTySize,
2281 auto *SwInst = CGF.
Builder.CreateSwitch(Res, DefaultBB, 1);
2288 SwInst->addCase(CGF.
Builder.getInt32(1), Case1BB);
2293 auto &&CodeGen = [&
Privates, &LHSExprs, &RHSExprs, &ReductionOps,
2295 auto IPriv = Privates.begin();
2296 auto ILHS = LHSExprs.begin();
2297 auto IRHS = RHSExprs.begin();
2298 for (
auto *E : ReductionOps) {
2300 cast<DeclRefExpr>(*IRHS));
2307 NVPTXActionTy Action(
2313 CGF.EmitBranch(DefaultBB);
2314 CGF.EmitBlock(DefaultBB,
true);
2319 const VarDecl *NativeParam)
const {
2324 const Type *NonQualTy = QC.
strip(ArgType);
2325 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2326 if (
const auto *
Attr = FD->
getAttr<OMPCaptureKindAttr>()) {
2327 if (
Attr->getCaptureKind() == OMPC_map) {
2334 enum { NVPTX_local_addr = 5 };
2337 if (isa<ImplicitParamDecl>(NativeParam)) {
2353 const VarDecl *TargetParam)
const {
2354 assert(NativeParam != TargetParam &&
2356 "Native arg must not be the same as target arg.");
2360 const Type *NonQualTy = QC.
strip(NativeParamType);
2361 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2362 unsigned NativePointeeAddrSpace =
2369 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
2373 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
2374 NativePointeeAddrSpace));
2378 return NativeParamAddr;
2385 TargetArgs.reserve(Args.size());
2387 cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
2388 for (
unsigned I = 0, E = Args.size(); I < E; ++I) {
2389 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
2390 TargetArgs.append(std::next(Args.begin(), I), Args.end());
2393 llvm::Type *TargetType = FnType->getParamType(I);
2395 if (!TargetType->isPointerTy()) {
2396 TargetArgs.emplace_back(NativeArg);
2400 NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
2402 TargetArgs.emplace_back(
2412 llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
2429 WrapperArgs.emplace_back(&ParallelLevelArg);
2430 WrapperArgs.emplace_back(&WrapperArg);
2431 WrapperArgs.emplace_back(&SharedArgsList);
2438 OutlinedParallelFn->getName() +
"_wrapper", &CGM.
getModule());
2445 const auto *RD = CS.getCapturedRecordDecl();
2446 auto CurField = RD->field_begin();
2452 Args.emplace_back(llvm::ConstantPointerNull::get(
2453 CGM.
Int32Ty->getPointerTo()));
2454 Args.emplace_back(llvm::ConstantPointerNull::get(
2455 CGM.
Int32Ty->getPointerTo()));
2458 auto CI = CS.capture_begin();
2466 for (
unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) {
2469 if (CI->capturesThis())
2472 Name = CI->getCapturedVar()->getName();
2476 QualType ElemTy = CurField->getType();
2479 if (CI->capturesVariableByCopy())
2489 Args.emplace_back(Arg);
2492 emitCall(CGF, OutlinedParallelFn, Args);
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
llvm::PointerType * Int8PtrPtrTy
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
Other implicit parameter.
PointerType - C99 6.7.5.1 - Pointer Declarators.
A (possibly-)qualified type.
CodeGenTypes & getTypes()
llvm::Type * ConvertTypeForMem(QualType T)
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads)
Get barrier #ID to synchronize selected (multiple of warp size) threads in a CTA. ...
llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
static CGOpenMPRuntimeNVPTX::ExecutionMode getExecutionModeForDirective(CodeGenModule &CGM, const OMPExecutableDirective &D)
CharUnits getPointerSize() const
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter...
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
llvm::Value * ScratchpadIndex
The base class of the type hierarchy.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
llvm::CallSite EmitCallOrInvoke(llvm::Value *Callee, ArrayRef< llvm::Value *> Args, const Twine &Name="")
Emits a call or invoke instruction to the given function, depending on the current state of the EH st...
llvm::Value * emitReductionFunction(CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps)
Emits reduction function.
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
VarDecl - An instance of this class is created to represent a variable declaration or definition...
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
LangAS getLangASFromTargetAS(unsigned TargetAS)
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
OpenMPDirectiveKind ReductionKind
llvm::Value * getPointer() const
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
IdentifierInfo * getIdentifier() const
getIdentifier - Get the identifier that names this declaration, if there is one.
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
llvm::IntegerType * Int64Ty
FieldDecl - An instance of this class is created by Sema::ActOnField to represent a member of a struc...
llvm::IntegerType * SizeTy
static llvm::Value * getMasterThreadID(CodeGenFunction &CGF)
Get the thread id of the OMP master thread.
llvm::CallInst * EmitRuntimeCall(llvm::Value *callee, const Twine &name="")
bool isReferenceType() const
OpenMPDirectiveKind getDirectiveKind() const
void InitTempAlloca(Address Alloca, llvm::Value *Value)
InitTempAlloca - Provide an initial value for the given alloca which will be observable at all locati...
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
llvm::PointerType * VoidPtrTy
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Scope - A scope is a transient data structure that is used while parsing the program.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars, const Expr *IfCond) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
virtual llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
CharUnits getPointerAlign() const
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
llvm::Constant * CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false)
Create a new runtime function with the specified type and name.
static void syncCTAThreads(CodeGenFunction &CGF)
Synchronize all GPU threads in a block.
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
ASTContext & getContext() const
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignTempAlloca - This creates an alloca with the default ABI alignment of the given L...
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Expr - This represents one expression.
virtual llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
Enters a new scope for capturing cleanups, all of which will be executed once the scope is exited...
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
const CGFunctionInfo & arrangeNullaryFunction()
A nullary function is a freestanding function of type 'void ()'.
const T * castAs() const
Member-template castAs<specific type>.
void SetInternalFunctionAttributes(const Decl *D, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
llvm::PointerType * getType() const
Return the type of the pointer value.
DeclContext * getDeclContext()
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
llvm::IntegerType * Int32Ty
MachineConfiguration
GPU Configuration: This information can be derived from cuda registers, however, providing compile ti...
const LangOptions & getLangOpts() const
ASTContext & getContext() const
OpenMPProcBindClauseKind
OpenMP attributes for 'proc_bind' clause.
llvm::Value * ScratchpadWidth
GlobalDecl - represents a global declaration.
The l-value was considered opaque, so the alignment was determined from a type.
llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits outlined function for the specified OpenMP teams directive.
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen)
Emits code for OpenMP 'if' clause using specified CodeGen function.
Encodes a location in the source.
This is a basic class for representing single OpenMP executable directive.
llvm::IntegerType * Int16Ty
OpenMPDirectiveKind
OpenMP directives.
static llvm::Value * emitReduceScratchpadFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn)
This function emits a helper that loads data from the scratchpad array and (optionally) reduces it wi...
This file defines OpenMP nodes for declarative directives.
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
void emitCall(CodeGenFunction &CGF, llvm::Value *Callee, ArrayRef< llvm::Value *> Args=llvm::None, SourceLocation Loc=SourceLocation()) const
Emits Callee function call with arguments Args with location Loc.
static llvm::Value * getThreadLimit(CodeGenFunction &CGF, bool IsInSpmdExecutionMode=false)
Get the value of the thread_limit clause in the teams directive.
FunctionArgList - Type for representing both the decl and type of parameters to a function...
void setAction(PrePostActionTy &Action) const
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Dataflow Directional Tag Classes.
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
static void getNVPTXCTABarrier(CodeGenFunction &CGF)
Get barrier to synchronize all threads in a block.
static llvm::Value * emitCopyToScratchpad(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy)
This function emits a helper that stores reduced data from the team master to a scratchpad array in g...
A qualifier set is used to build a set of qualifiers.
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr *> Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values, in the specified direction.
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, CGOpenMPRuntimeNVPTX::ExecutionMode Mode)
SourceLocation getLocStart() const LLVM_READONLY
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
llvm::Module & getModule() const
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, QualType ElemTy, llvm::Value *Elem, llvm::Value *Offset)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads)
Synchronize worker threads in a parallel region.
llvm::Constant * createNVPTXRuntimeFunction(unsigned Function)
Returns specified OpenMP runtime function for the current OpenMP implementation.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options) override
Emit code for the reduction clause.
This file defines OpenMP AST classes for executable directives and clauses.
Address CreateConstArrayGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = [n x T]* ...
llvm::PointerType * Int8PtrTy
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
ExecutionMode
Target codegen is specialized based on two programming models: the 'generic' fork-join model of OpenM...
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Privates[]
Gets the list of initial values for linear variables.
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression...
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
std::pair< llvm::Value *, QualType > getVLASize(const VariableArrayType *vla)
getVLASize - Returns an LLVM value that corresponds to the size, in non-variably-sized elements...
static llvm::Value * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
static llvm::Value * getNVPTXThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
const VariableArrayType * getAsVariableArrayType(QualType T) const
static llvm::Value * getNVPTXWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
llvm::Value * RemoteLaneOffset
CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
void addAddressSpace(LangAS space)
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
unsigned getTargetAddressSpace(QualType T) const
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Address CreateMemTemp(QualType T, const Twine &Name="tmp", bool CastToDefaultAddrSpace=true)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment...
static llvm::Value * getNVPTXNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
llvm::Value * getPointer() const
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc)
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Attr - This represents one attribute.
SourceLocation getLocation() const
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth, signed/unsigned.
static OMPLinearClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef< Expr *> VL, ArrayRef< Expr *> PL, ArrayRef< Expr *> IL, Expr *Step, Expr *CalcStep, Stmt *PreInit, Expr *PostUpdate)
Creates clause with a list of variables VL and a linear step Step.
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.