//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
  /// global_tid, kmp_critical_name *lck)
  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple,
  /// Call to __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc,
  /// kmp_int32 global_tid, kmp_critical_name *lck)
  OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
  /// const void *buf, size_t size, int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t
  /// isSPMDExecutionMode, int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
  OMPRTL__kmpc_barrier,
  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL__kmpc_barrier_simple_spmd,
};
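
// Schematically, the kernels emitted below pair these entry points roughly as
// follows (see emitNonSPMDEntryHeader/Footer and emitSPMDEntryHeader/Footer):
//
//   Generic (non-SPMD) kernel:
//     workers: loop in __kmpc_kernel_parallel() waiting for work;
//     master:  __kmpc_kernel_init(); __kmpc_data_sharing_init_stack();
//              ... sequential region, parallel work handed to workers ...
//              __kmpc_kernel_deinit();
//
//   SPMD kernel (all threads):
//     __kmpc_spmd_kernel_init();
//     ... region body executed by every thread ...
//     __kmpc_spmd_kernel_deinit_v2();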

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
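
// A minimal usage sketch (EnterFn/ExitFn/Args are placeholders, not real
// callees): construct the action with matching enter/exit runtime calls and
// hand it to a RegionCodeGenTy, which invokes Enter/Exit around the region:
//
//   NVPTXActionTy Action(EnterFn, Args, ExitFn, Args, /*Conditional=*/true);
//   Action.Enter(CGF); // emits EnterFn(Args); opens omp_if.then if nonzero
//   ... emit region body ...
//   Action.Done(CGF);  // conditional form only: emits the omp_if.end block
//   Action.Exit(CGF);  // emits ExitFn(Args)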

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeNVPTX::EM_Unknown;
  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};
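
// With these constants, for a thread id `tid` the helpers below compute
//   warp id = tid >> LaneIDBits  (e.g. tid 70 -> warp 2)
//   lane id = tid & LaneIDMask   (e.g. tid 70 -> lane 6)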

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
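
// For example, for a private item written as `a[i][j]` or as the array
// section `a[1:2]`, this returns the canonical declaration of `a`; for a
// member reference `s.f` it returns the declaration of the member `f`.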

typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
  return P1.first > P2.first;
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields) {
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, WarpSize);
      Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation())));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
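
// For example, globalizing an escaped `double d` and `int i` from a parallel
// (non-teams) region yields, roughly:
//
//   struct _globalized_locals_ty {
//     double d[32] __attribute__((aligned(128))); // one slot per lane
//     int i[32] __attribute__((aligned(128)));
//   };
//
// Fields are laid out in decreasing alignment order; fields coming from
// EscapedDeclsForTeams keep their scalar type and original alignment.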

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(
                   static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeNVPTX::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
                                          : CGOpenMPRuntimeNVPTX::Generic;
}

/// Checks if the expression is constant or does not have non-trivial function
/// calls.
static bool isTrivial(ASTContext &Ctx, const Expr * E) {
  // We can skip constant expressions.
  // We can skip expressions with trivial calls or simple expressions.
  return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
          !E->hasNonTrivialCall(Ctx)) &&
         !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
}

/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
/// iff there is only one that is not evaluatable at compile time.
static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
    const Stmt *Child = nullptr;
    for (const Stmt *S : C->body()) {
      if (const auto *E = dyn_cast<Expr>(S)) {
        if (isTrivial(Ctx, E))
          continue;
      }
      // Some of the statements can be ignored.
      if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
          isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
        continue;
      // Analyze declarations.
      if (const auto *DS = dyn_cast<DeclStmt>(S)) {
        if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
              if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
                  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
                  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
                  isa<UsingDirectiveDecl>(D) ||
                  isa<OMPDeclareReductionDecl>(D) ||
                  isa<OMPThreadPrivateDecl>(D))
                return true;
              const auto *VD = dyn_cast<VarDecl>(D);
              if (!VD)
                return false;
              return VD->isConstexpr() ||
                     ((VD->getType().isTrivialType(Ctx) ||
                       VD->getType()->isReferenceType()) &&
                      (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
            }))
          continue;
      }
      // Found multiple children - cannot get the one child only.
      if (Child)
        return Body;
      Child = S;
    }
    if (Child)
      return Child;
  }
  return Body;
}
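
// For example, for the body `{ int c = 0; ; foo(); }` the trivial declaration
// and the null statement are skipped and `foo();` is returned as the single
// child; if more than one non-trivial statement remains, the whole `Body` is
// returned instead.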

/// Check if the parallel directive has an 'if' clause with non-constant or
/// false condition. Also, check if the number of threads is strictly specified
/// and run those directives in non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
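
// For example, each of the following forces the generic (non-SPMD) scheme:
//   #pragma omp target parallel num_threads(4)
//   #pragma omp target parallel if(n > 16)  // non-constant condition
//   #pragma omp target parallel if(0)       // constant false
// whereas a constant-true 'if' clause (or no such clauses) keeps the
// directive eligible for SPMD mode.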

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
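
// For example, the following nest is detected as SPMD because the parallel
// directive is the single non-trivial child at each level:
//
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
//   for (...) ...
//
// An extra non-trivial statement between the directives, or an 'if' or
// 'num_threads' clause on the inner parallel, falls back to generic mode.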
872 
874  const OMPExecutableDirective &D) {
875  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
876  switch (DirectiveKind) {
877  case OMPD_target:
878  case OMPD_target_teams:
879  return hasNestedSPMDDirective(Ctx, D);
880  case OMPD_target_parallel:
881  case OMPD_target_parallel_for:
882  case OMPD_target_parallel_for_simd:
883  case OMPD_target_teams_distribute_parallel_for:
884  case OMPD_target_teams_distribute_parallel_for_simd:
885  return !hasParallelIfNumThreadsClause(Ctx, D);
886  case OMPD_target_simd:
887  case OMPD_target_teams_distribute:
888  case OMPD_target_teams_distribute_simd:
889  return false;
890  case OMPD_parallel:
891  case OMPD_for:
892  case OMPD_parallel_for:
893  case OMPD_parallel_sections:
894  case OMPD_for_simd:
895  case OMPD_parallel_for_simd:
896  case OMPD_cancel:
897  case OMPD_cancellation_point:
898  case OMPD_ordered:
899  case OMPD_threadprivate:
900  case OMPD_task:
901  case OMPD_simd:
902  case OMPD_sections:
903  case OMPD_section:
904  case OMPD_single:
905  case OMPD_master:
906  case OMPD_critical:
907  case OMPD_taskyield:
908  case OMPD_barrier:
909  case OMPD_taskwait:
910  case OMPD_taskgroup:
911  case OMPD_atomic:
912  case OMPD_flush:
913  case OMPD_teams:
914  case OMPD_target_data:
915  case OMPD_target_exit_data:
916  case OMPD_target_enter_data:
917  case OMPD_distribute:
918  case OMPD_distribute_simd:
919  case OMPD_distribute_parallel_for:
920  case OMPD_distribute_parallel_for_simd:
921  case OMPD_teams_distribute:
922  case OMPD_teams_distribute_simd:
923  case OMPD_teams_distribute_parallel_for:
924  case OMPD_teams_distribute_parallel_for_simd:
925  case OMPD_target_update:
926  case OMPD_declare_simd:
927  case OMPD_declare_target:
928  case OMPD_end_declare_target:
929  case OMPD_declare_reduction:
930  case OMPD_taskloop:
931  case OMPD_taskloop_simd:
932  case OMPD_requires:
933  case OMPD_unknown:
934  break;
935  }
936  llvm_unreachable(
937  "Unknown programming model for OpenMP directive on NVPTX target.");
938 }

/// Check if the directive is loop-based and either has no schedule clause at
/// all or has static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
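
// For example:
//   #pragma omp target parallel for                    // no schedule -> true
//   #pragma omp target parallel for schedule(static)   // -> true
//   #pragma omp target parallel for schedule(dynamic)  // -> false
//   #pragma omp target parallel for ordered            // -> false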

/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Ctx, Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
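
// For example, this directive runs in SPMD mode with the lightweight runtime,
// because the combined construct has default (static) scheduling:
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < n; ++i) ...
//
// Adding schedule(dynamic) to the loop drops it back to the full runtime.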

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
                         /*RequiresDataSharing=*/Bld.getInt16(0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to set up
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}
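
// For an outlined kernel named `foo` (a placeholder name) compiled in SPMD
// mode this emits, roughly:
//
//   @foo_exec_mode = weak constant i8 0
//
// (i8 1 for generic mode). Registering it with addCompilerUsedGlobal keeps
// the variable alive in the device image that the offload library inspects.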

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //
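  // Roughly, each worker runs (a sketch of the IR emitted below, not a real
  // function):
  //
  //   for (;;) {
  //     syncCTAThreads();                           // wait for master
  //     active = __kmpc_kernel_parallel(&work_fn, ...);
  //     if (work_fn == 0) break;                    // termination signal
  //     if (active) { work_fn(0, tid); __kmpc_kernel_end_parallel(); }
  //     syncCTAThreads();                           // join barrier
  //   }
  //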

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // Skip initialization.
  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreateBitOrPointerCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false)
          ->getPointerTo();
  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID;
  emitCall(CGF, WST.Loc, WorkFnCast,
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
  // Skip initialization.
  clearLocThreadIdInsertPt(CGF);
}
1545 
1546 /// Returns specified OpenMP runtime function for the current OpenMP
1547 /// implementation. Specialized for the NVPTX device.
1548 /// \param Function OpenMP runtime function.
1549 /// \return Specified function.
1550 llvm::Constant *
1552  llvm::Constant *RTLFn = nullptr;
1553  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1554  case OMPRTL_NVPTX__kmpc_kernel_init: {
1555  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1556  // RequiresOMPRuntime);
1557  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1558  auto *FnTy =
1559  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1560  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1561  break;
1562  }
1563  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1564  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1565  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1566  auto *FnTy =
1567  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1568  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1569  break;
1570  }
1571  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1572  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1573  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1574  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1575  auto *FnTy =
1576  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1577  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1578  break;
1579  }
1580  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1581  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1582  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1583  auto *FnTy =
1584  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1585  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1586  break;
1587  }
1588  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1589  // Build void __kmpc_kernel_prepare_parallel(
1590  // void *outlined_function, int16_t IsOMPRuntimeInitialized);
1591  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1592  auto *FnTy =
1593  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1594  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1595  break;
1596  }
1597  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1598  // Build bool __kmpc_kernel_parallel(void **outlined_function,
1599  // int16_t IsOMPRuntimeInitialized);
1600  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1601  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1602  auto *FnTy =
1603  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1604  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1605  break;
1606  }
1607  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1608  // Build void __kmpc_kernel_end_parallel();
1609  auto *FnTy =
1610  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1611  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1612  break;
1613  }
1614  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1615  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1616  // global_tid);
1617  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1618  auto *FnTy =
1619  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1620  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1621  break;
1622  }
1623  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1624  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1625  // global_tid);
1626  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1627  auto *FnTy =
1628  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1629  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1630  break;
1631  }
1632  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1633  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1634  // int16_t lane_offset, int16_t warp_size);
1635  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1636  auto *FnTy =
1637  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1638  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1639  break;
1640  }
1641  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1642  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1643  // int16_t lane_offset, int16_t warp_size);
1644  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1645  auto *FnTy =
1646  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1647  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1648  break;
1649  }
1650  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: {
1651  // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1652  // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1653  // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1654  // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1655  // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
1656  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1657  CGM.Int16Ty, CGM.Int16Ty};
1658  auto *ShuffleReduceFnTy =
1659  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1660  /*isVarArg=*/false);
1661  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1662  auto *InterWarpCopyFnTy =
1663  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1664  /*isVarArg=*/false);
1665  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1666  CGM.Int32Ty,
1667  CGM.Int32Ty,
1668  CGM.SizeTy,
1669  CGM.VoidPtrTy,
1670  ShuffleReduceFnTy->getPointerTo(),
1671  InterWarpCopyFnTy->getPointerTo()};
1672  auto *FnTy =
1673  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1674  RTLFn = CGM.CreateRuntimeFunction(
1675  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1676  break;
1677  }
1678  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1679  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1680  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1681  auto *FnTy =
1682  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1683  RTLFn = CGM.CreateRuntimeFunction(
1684  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1685  break;
1686  }
1687  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple: {
1688  // Build __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
1689  // global_tid, kmp_critical_name *lck)
1690  llvm::Type *TypeParams[] = {
1691  getIdentTyPointerTy(), CGM.Int32Ty,
1692  llvm::PointerType::getUnqual(getKmpCriticalNameTy())};
1693  auto *FnTy =
1694  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1695  RTLFn = CGM.CreateRuntimeFunction(
1696  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_simple");
1697  break;
1698  }
1699  case OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple: {
1700  // Build __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc, kmp_int32
1701  // global_tid, kmp_critical_name *lck)
1702  llvm::Type *TypeParams[] = {
1703  getIdentTyPointerTy(), CGM.Int32Ty,
1704  llvm::PointerType::getUnqual(getKmpCriticalNameTy())};
1705  auto *FnTy =
1706  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1707  RTLFn = CGM.CreateRuntimeFunction(
1708  FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple");
1709  break;
1710  }
1711  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1712  // Build void __kmpc_data_sharing_init_stack();
1713  auto *FnTy =
1714  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1715  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1716  break;
1717  }
1718  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1719  // Build void __kmpc_data_sharing_init_stack_spmd();
1720  auto *FnTy =
1721  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1722  RTLFn =
1723  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1724  break;
1725  }
1726  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1727  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1728  // int16_t UseSharedMemory);
1729  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1730  auto *FnTy =
1731  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1732  RTLFn = CGM.CreateRuntimeFunction(
1733  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1734  break;
1735  }
1736  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1737  // Build void __kmpc_data_sharing_pop_stack(void *a);
1738  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1739  auto *FnTy =
1740  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1741  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1742  /*Name=*/"__kmpc_data_sharing_pop_stack");
1743  break;
1744  }
1745  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1746  // Build void __kmpc_begin_sharing_variables(void ***args,
1747  // size_t n_args);
1748  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1749  auto *FnTy =
1750  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1751  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1752  break;
1753  }
1754  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1755  // Build void __kmpc_end_sharing_variables();
1756  auto *FnTy =
1757  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1758  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1759  break;
1760  }
1761  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1762  // Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1763  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1764  auto *FnTy =
1765  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1766  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1767  break;
1768  }
1769  case OMPRTL_NVPTX__kmpc_parallel_level: {
1770  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1771  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1772  auto *FnTy =
1773  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1774  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1775  break;
1776  }
1777  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1778  // Build int8_t __kmpc_is_spmd_exec_mode();
1779  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1780  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1781  break;
1782  }
1783  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1784  // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1785  // const void *buf, size_t size, int16_t is_shared, const void **res);
1786  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1787  CGM.Int16Ty, CGM.VoidPtrPtrTy};
1788  auto *FnTy =
1789  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1790  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1791  break;
1792  }
1793  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1794  // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1795  // int16_t is_shared);
1796  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1797  auto *FnTy =
1798  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1799  RTLFn =
1800  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1801  break;
1802  }
1803  case OMPRTL__kmpc_barrier: {
1804  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1805  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1806  auto *FnTy =
1807  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1808  RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1809  cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
1810  break;
1811  }
1812  case OMPRTL__kmpc_barrier_simple_spmd: {
1813  // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1814  // global_tid);
1815  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1816  auto *FnTy =
1817  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1818  RTLFn =
1819  CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1820  cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
1821  break;
1822  }
1823  }
1824  return RTLFn;
1825 }
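// Every case above follows the same declare-on-first-use pattern: describe
// the entry point with an llvm::FunctionType and let CodeGenModule create
// (or reuse) the declaration. A minimal sketch, assuming a CodeGenModule
// &CGM is in scope:
//
//   llvm::Type *Params[] = {CGM.Int16Ty};
//   auto *FnTy =
//       llvm::FunctionType::get(CGM.VoidTy, Params, /*isVarArg=*/false);
//   llvm::Constant *Fn =
//       CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
//
// The returned constant is then passed to CGF.EmitRuntimeCall at call sites.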
1826 
1827 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1828  llvm::Constant *Addr,
1829  uint64_t Size, int32_t,
1830  llvm::GlobalValue::LinkageTypes) {
1831  // TODO: Add support for global variables on the device after declare target
1832  // support.
1833  if (!isa<llvm::Function>(Addr))
1834  return;
1835  llvm::Module &M = CGM.getModule();
1836  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1837 
1838  // Get "nvvm.annotations" metadata node
1839  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1840 
1841  llvm::Metadata *MDVals[] = {
1842  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1843  llvm::ConstantAsMetadata::get(
1844  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1845  // Append metadata to nvvm.annotations
1846  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1847 }
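// For a kernel entry point @foo, the annotation added above corresponds to
// module-level IR along these lines (illustrative; @foo is a placeholder):
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @foo, !"kernel", i32 1}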
1848 
1849 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1850  const OMPExecutableDirective &D, StringRef ParentName,
1851  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1852  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1853  if (!IsOffloadEntry) // Nothing to do.
1854  return;
1855 
1856  assert(!ParentName.empty() && "Invalid target region parent name!");
1857 
1858  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1859  if (Mode)
1860  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1861  CodeGen);
1862  else
1863  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1864  CodeGen);
1865 
1866  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1867 }
1868 
1869 namespace {
1870 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1871 /// Enum for accessing the reserved_2 field of the ident_t struct.
1872 enum ModeFlagsTy : unsigned {
1873  /// Bit set to 1 when in SPMD mode.
1874  KMP_IDENT_SPMD_MODE = 0x01,
1875  /// Bit set to 1 when a simplified runtime is used.
1876  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1877  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1878 };
1879 
1880 /// Special Undefined mode: the combination of Non-SPMD mode + SimpleRuntime.
1881 static const ModeFlagsTy UndefinedMode =
1882  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1883 } // anonymous namespace
1884 
1885 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1886  switch (getExecutionMode()) {
1887  case EM_SPMD:
1888  if (requiresFullRuntime())
1889  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1890  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1891  case EM_NonSPMD:
1892  assert(requiresFullRuntime() && "Expected full runtime.");
1893  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1894  case EM_Unknown:
1895  return UndefinedMode;
1896  }
1897  llvm_unreachable("Unknown flags are requested.");
1898 }
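// Spelled out, the reserved_2 encodings returned above are:
//   EM_SPMD, full runtime       -> KMP_IDENT_SPMD_MODE
//   EM_SPMD, simplified runtime -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE
//   EM_NonSPMD (full runtime)   -> 0
//   EM_Unknown                  -> KMP_IDENT_SIMPLE_RT_MODE (UndefinedMode)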
1899 
1900 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1901  : CGOpenMPRuntime(CGM, "_", "$") {
1902  if (!CGM.getLangOpts().OpenMPIsDevice)
1903  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1904 }
1905 
1906 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1907  OpenMPProcBindClauseKind ProcBind,
1908  SourceLocation Loc) {
1909  // Do nothing in case of SPMD mode and L0 parallel.
1910  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1911  return;
1912 
1913  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1914 }
1915 
1916 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1917  llvm::Value *NumThreads,
1918  SourceLocation Loc) {
1919  // Do nothing in case of SPMD mode and L0 parallel.
1920  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1921  return;
1922 
1923  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1924 }
1925 
1926 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1927  const Expr *NumTeams,
1928  const Expr *ThreadLimit,
1929  SourceLocation Loc) {}
1930 
1931 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1932  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1933  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1934  // Emit target region as a standalone region.
1935  class NVPTXPrePostActionTy : public PrePostActionTy {
1936  bool &IsInParallelRegion;
1937  bool PrevIsInParallelRegion;
1938 
1939  public:
1940  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1941  : IsInParallelRegion(IsInParallelRegion) {}
1942  void Enter(CodeGenFunction &CGF) override {
1943  PrevIsInParallelRegion = IsInParallelRegion;
1944  IsInParallelRegion = true;
1945  }
1946  void Exit(CodeGenFunction &CGF) override {
1947  IsInParallelRegion = PrevIsInParallelRegion;
1948  }
1949  } Action(IsInParallelRegion);
1950  CodeGen.setAction(Action);
1951  bool PrevIsInTTDRegion = IsInTTDRegion;
1952  IsInTTDRegion = false;
1953  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1954  IsInTargetMasterThreadRegion = false;
1955  auto *OutlinedFun =
1956  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1957  D, ThreadIDVar, InnermostKind, CodeGen));
1958  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1959  IsInTTDRegion = PrevIsInTTDRegion;
1960  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1961  !IsInParallelRegion) {
1962  llvm::Function *WrapperFun =
1963  createParallelDataSharingWrapper(OutlinedFun, D);
1964  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1965  }
1966 
1967  return OutlinedFun;
1968 }
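// Note that a data-sharing wrapper is created only when the outlined
// function may be launched from the worker loop, i.e. in non-SPMD mode and
// outside any enclosing parallel region; in SPMD mode, and for nested
// parallel regions, the outlined function is invoked directly.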
1969 
1970 /// Get list of lastprivate variables from the teams distribute ... or
1971 /// teams {distribute ...} directives.
1972 static void
1973 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1974  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1975  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1976  "expected teams directive.");
1977  const OMPExecutableDirective *Dir = &D;
1978  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1979  if (const Stmt *S = getSingleCompoundChild(
1980  Ctx,
1981  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1982  /*IgnoreCaptured=*/true))) {
1983  Dir = dyn_cast<OMPExecutableDirective>(S);
1984  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1985  Dir = nullptr;
1986  }
1987  }
1988  if (!Dir)
1989  return;
1990  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1991  for (const Expr *E : C->getVarRefs())
1992  Vars.push_back(getPrivateItem(E));
1993  }
1994 }
1995 
1996 /// Get list of reduction variables from the teams ... directives.
1997 static void
1998 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1999  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2000  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2001  "expected teams directive.");
2002  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2003  for (const Expr *E : C->privates())
2004  Vars.push_back(getPrivateItem(E));
2005  }
2006 }
2007 
2008 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2009  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2010  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2011  SourceLocation Loc = D.getBeginLoc();
2012 
2013  const RecordDecl *GlobalizedRD = nullptr;
2014  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
2015  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2016  // Globalize team reductions variable unconditionally in all modes.
2017  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2018  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2019  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2020  if (!LastPrivatesReductions.empty()) {
2021  GlobalizedRD = ::buildRecordForGlobalizedVars(
2022  CGM.getContext(), llvm::None, LastPrivatesReductions,
2023  MappedDeclsFields);
2024  }
2025  } else if (!LastPrivatesReductions.empty()) {
2026  assert(!TeamAndReductions.first &&
2027  "Previous team declaration is not expected.");
2028  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2029  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2030  }
2031 
2032  // Emit target region as a standalone region.
2033  class NVPTXPrePostActionTy : public PrePostActionTy {
2034  SourceLocation &Loc;
2035  const RecordDecl *GlobalizedRD;
2036  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2037  &MappedDeclsFields;
2038 
2039  public:
2040  NVPTXPrePostActionTy(
2041  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2042  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2043  &MappedDeclsFields)
2044  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2045  MappedDeclsFields(MappedDeclsFields) {}
2046  void Enter(CodeGenFunction &CGF) override {
2047  auto &Rt =
2048  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2049  if (GlobalizedRD) {
2050  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2051  I->getSecond().GlobalRecord = GlobalizedRD;
2052  I->getSecond().MappedParams =
2053  llvm::make_unique<CodeGenFunction::OMPMapVars>();
2054  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2055  for (const auto &Pair : MappedDeclsFields) {
2056  assert(Pair.getFirst()->isCanonicalDecl() &&
2057  "Expected canonical declaration");
2058  Data.insert(std::make_pair(Pair.getFirst(),
2059  MappedVarData(Pair.getSecond(),
2060  /*IsOnePerTeam=*/true)));
2061  }
2062  }
2063  Rt.emitGenericVarsProlog(CGF, Loc);
2064  }
2065  void Exit(CodeGenFunction &CGF) override {
2066  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2067  .emitGenericVarsEpilog(CGF);
2068  }
2069  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2070  CodeGen.setAction(Action);
2071  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2072  D, ThreadIDVar, InnermostKind, CodeGen);
2073  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
2074  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2075  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2076  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2077 
2078  return OutlinedFun;
2079 }
2080 
2081 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2082  SourceLocation Loc,
2083  bool WithSPMDCheck) {
2084  if (getDataSharingMode(CGM) == CGOpenMPRuntimeNVPTX::CUDA &&
2085  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2086  return;
2087 
2088  CGBuilderTy &Bld = CGF.Builder;
2089 
2090  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2091  if (I == FunctionGlobalizedDecls.end())
2092  return;
2093  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2094  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2095  QualType SecGlobalRecTy;
2096 
2097  // Recover pointer to this function's global record. The runtime will
2098  // handle the specifics of the allocation of the memory.
2099  // Use actual memory size of the record including the padding
2100  // for alignment purposes.
2101  unsigned Alignment =
2102  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2103  unsigned GlobalRecordSize =
2104  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2105  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2106 
2107  llvm::PointerType *GlobalRecPtrTy =
2108  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2109  llvm::Value *GlobalRecCastAddr;
2110  llvm::Value *IsTTD = nullptr;
2111  if (!IsInTTDRegion &&
2112  (WithSPMDCheck ||
2113  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2114  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2115  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2116  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2117  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2118  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2119  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2120  llvm::Value *PL = CGF.EmitRuntimeCall(
2121  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2122  {RTLoc, ThreadID});
2123  IsTTD = Bld.CreateIsNull(PL);
2124  }
2125  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2126  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2127  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2128  // There is no need to emit line number for unconditional branch.
2129  (void)ApplyDebugLocation::CreateEmpty(CGF);
2130  CGF.EmitBlock(SPMDBB);
2131  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2132  CharUnits::fromQuantity(Alignment));
2133  CGF.EmitBranch(ExitBB);
2134  // There is no need to emit line number for unconditional branch.
2135  (void)ApplyDebugLocation::CreateEmpty(CGF);
2136  CGF.EmitBlock(NonSPMDBB);
2137  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2138  if (const RecordDecl *SecGlobalizedVarsRecord =
2139  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2140  SecGlobalRecTy =
2141  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2142 
2143  // Recover pointer to this function's global record. The runtime will
2144  // handle the specifics of the allocation of the memory.
2145  // Use actual memory size of the record including the padding
2146  // for alignment purposes.
2147  unsigned Alignment =
2148  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2149  unsigned GlobalRecordSize =
2150  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2151  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2152  Size = Bld.CreateSelect(
2153  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2154  }
2155  // TODO: allow the usage of shared memory to be controlled by
2156  // the user, for now, default to global.
2157  llvm::Value *GlobalRecordSizeArg[] = {
2158  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2159  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2160  createNVPTXRuntimeFunction(
2161  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2162  GlobalRecordSizeArg);
2163  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2164  GlobalRecValue, GlobalRecPtrTy);
2165  CGF.EmitBlock(ExitBB);
2166  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2167  /*NumReservedValues=*/2, "_select_stack");
2168  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2169  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2170  GlobalRecCastAddr = Phi;
2171  I->getSecond().GlobalRecordAddr = Phi;
2172  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2173  } else if (IsInTTDRegion) {
2174  assert(GlobalizedRecords.back().Records.size() < 2 &&
2175  "Expected less than 2 globalized records: one for target and one "
2176  "for teams.");
2177  unsigned Offset = 0;
2178  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2179  QualType RDTy = CGM.getContext().getRecordType(RD);
2180  unsigned Alignment =
2181  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2182  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2183  Offset =
2184  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2185  }
2186  unsigned Alignment =
2187  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2188  Offset = llvm::alignTo(Offset, Alignment);
2189  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2190  ++GlobalizedRecords.back().RegionCounter;
2191  if (GlobalizedRecords.back().Records.size() == 1) {
2192  assert(KernelStaticGlobalized &&
2193  "Kernel static pointer must be initialized already.");
2194  auto *UseSharedMemory = new llvm::GlobalVariable(
2195  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2196  llvm::GlobalValue::InternalLinkage, nullptr,
2197  "_openmp_static_kernel$is_shared");
2198  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2199  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2200  /*DestWidth=*/16, /*Signed=*/0);
2201  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2202  Address(UseSharedMemory,
2203  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2204  /*Volatile=*/false, Int16Ty, Loc);
2205  auto *StaticGlobalized = new llvm::GlobalVariable(
2206  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2207  llvm::GlobalValue::CommonLinkage, nullptr);
2208  auto *RecSize = new llvm::GlobalVariable(
2209  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2210  llvm::GlobalValue::InternalLinkage, nullptr,
2211  "_openmp_static_kernel$size");
2212  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2213  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2214  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2215  CGM.getContext().getSizeType(), Loc);
2216  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2217  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2218  llvm::Value *GlobalRecordSizeArg[] = {
2219  llvm::ConstantInt::get(
2220  CGM.Int16Ty,
2221  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2222  StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2223  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2224  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2225  GlobalRecordSizeArg);
2226  GlobalizedRecords.back().Buffer = StaticGlobalized;
2227  GlobalizedRecords.back().RecSize = RecSize;
2228  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2229  GlobalizedRecords.back().Loc = Loc;
2230  }
2231  assert(KernelStaticGlobalized && "Global address must be set already.");
2232  Address FrameAddr = CGF.EmitLoadOfPointer(
2233  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2234  CGM.getContext()
2235  .getPointerType(CGM.getContext().VoidPtrTy)
2236  .castAs<PointerType>());
2237  llvm::Value *GlobalRecValue =
2238  Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One())
2239  .getPointer();
2240  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2241  I->getSecond().IsInSPMDModeFlag = nullptr;
2242  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2243  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2244  } else {
2245  // TODO: allow the usage of shared memory to be controlled by
2246  // the user, for now, default to global.
2247  llvm::Value *GlobalRecordSizeArg[] = {
2248  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2249  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2250  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2251  createNVPTXRuntimeFunction(
2252  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2253  GlobalRecordSizeArg);
2254  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2255  GlobalRecValue, GlobalRecPtrTy);
2256  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2257  I->getSecond().IsInSPMDModeFlag = nullptr;
2258  }
2259  LValue Base =
2260  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2261 
2262  // Emit the "global alloca" which is a GEP from the global declaration
2263  // record using the pointer returned by the runtime.
2264  LValue SecBase;
2265  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2266  if (IsTTD) {
2267  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2268  llvm::PointerType *SecGlobalRecPtrTy =
2269  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2270  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2271  Bld.CreatePointerBitCastOrAddrSpaceCast(
2272  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2273  SecGlobalRecTy);
2274  }
2275  for (auto &Rec : I->getSecond().LocalVarData) {
2276  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2277  llvm::Value *ParValue;
2278  if (EscapedParam) {
2279  const auto *VD = cast<VarDecl>(Rec.first);
2280  LValue ParLVal =
2281  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2282  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2283  }
2284  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2285  // Emit VarAddr based on lane-id if required.
2286  QualType VarTy;
2287  if (Rec.second.IsOnePerTeam) {
2288  VarTy = Rec.second.FD->getType();
2289  } else {
2290  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2291  VarAddr.getAddress().getPointer(),
2292  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2293  VarTy =
2294  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2295  VarAddr = CGF.MakeAddrLValue(
2296  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2297  AlignmentSource::Decl);
2298  }
2299  Rec.second.PrivateAddr = VarAddr.getAddress();
2300  if (!IsInTTDRegion &&
2301  (WithSPMDCheck ||
2302  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2303  assert(I->getSecond().IsInSPMDModeFlag &&
2304  "Expected unknown execution mode or required SPMD check.");
2305  if (IsTTD) {
2306  assert(SecIt->second.IsOnePerTeam &&
2307  "Secondary glob data must be one per team.");
2308  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2309  VarAddr.setAddress(
2310  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2311  VarAddr.getPointer()),
2312  VarAddr.getAlignment()));
2313  Rec.second.PrivateAddr = VarAddr.getAddress();
2314  }
2315  Address GlobalPtr = Rec.second.PrivateAddr;
2316  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2317  Rec.second.PrivateAddr = Address(
2318  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2319  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2320  LocalAddr.getAlignment());
2321  }
2322  if (EscapedParam) {
2323  const auto *VD = cast<VarDecl>(Rec.first);
2324  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2325  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2326  }
2327  if (IsTTD)
2328  ++SecIt;
2329  }
2330  }
2331  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2332  // Recover pointer to this function's global record. The runtime will
2333  // handle the specifics of the allocation of the memory.
2334  // Use actual memory size of the record including the padding
2335  // for alignment purposes.
2336  CGBuilderTy &Bld = CGF.Builder;
2337  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2338  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2339  Size = Bld.CreateNUWAdd(
2340  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2341  llvm::Value *AlignVal =
2342  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2343  Size = Bld.CreateUDiv(Size, AlignVal);
2344  Size = Bld.CreateNUWMul(Size, AlignVal);
2345  // TODO: allow the usage of shared memory to be controlled by
2346  // the user, for now, default to global.
2347  llvm::Value *GlobalRecordSizeArg[] = {
2348  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2349  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2350  createNVPTXRuntimeFunction(
2351  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2352  GlobalRecordSizeArg);
2353  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2354  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2355  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2356  CGM.getContext().getDeclAlign(VD),
2357  AlignmentSource::Decl);
2358  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2359  Base.getAddress());
2360  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2361  }
2362  I->getSecond().MappedParams->apply(CGF);
2363 }
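// For the escaped variable-length declarations handled above, the pushed
// size is the dynamic type size rounded up to the declaration's alignment.
// Worked example: a 10-byte value with 8-byte alignment yields
// (10 + 7) / 8 * 8 = 16 bytes passed to
// __kmpc_data_sharing_coalesced_push_stack.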
2364 
2365 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2366  bool WithSPMDCheck) {
2367  if (getDataSharingMode(CGM) == CGOpenMPRuntimeNVPTX::CUDA &&
2368  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2369  return;
2370 
2371  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2372  if (I != FunctionGlobalizedDecls.end()) {
2373  I->getSecond().MappedParams->restore(CGF);
2374  if (!CGF.HaveInsertPoint())
2375  return;
2376  for (llvm::Value *Addr :
2377  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2378  CGF.EmitRuntimeCall(
2379  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2380  Addr);
2381  }
2382  if (I->getSecond().GlobalRecordAddr) {
2383  if (!IsInTTDRegion &&
2384  (WithSPMDCheck ||
2385  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2386  CGBuilderTy &Bld = CGF.Builder;
2387  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2388  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2389  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2390  // There is no need to emit line number for unconditional branch.
2391  (void)ApplyDebugLocation::CreateEmpty(CGF);
2392  CGF.EmitBlock(NonSPMDBB);
2393  CGF.EmitRuntimeCall(
2394  createNVPTXRuntimeFunction(
2395  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2396  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2397  CGF.EmitBlock(ExitBB);
2398  } else if (IsInTTDRegion) {
2399  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2400  "region counter must be > 0.");
2401  --GlobalizedRecords.back().RegionCounter;
2402  // Emit the restore function only in the target region.
2403  if (GlobalizedRecords.back().RegionCounter == 0) {
2404  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2405  /*DestWidth=*/16, /*Signed=*/0);
2406  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2407  Address(GlobalizedRecords.back().UseSharedMemory,
2408  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2409  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2410  llvm::Value *Args[] = {
2411  llvm::ConstantInt::get(
2412  CGM.Int16Ty,
2413  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2414  IsInSharedMemory};
2415  CGF.EmitRuntimeCall(
2416  createNVPTXRuntimeFunction(
2417  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2418  Args);
2419  }
2420  } else {
2421  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2422  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2423  I->getSecond().GlobalRecordAddr);
2424  }
2425  }
2426  }
2427 }
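// The epilog mirrors the prolog: escaped variable-length allocations are
// popped first, in reverse order, and the function's global record is then
// released via __kmpc_data_sharing_pop_stack (generic case), via
// __kmpc_restore_team_static_memory (statically allocated team frame), or
// skipped at runtime on the SPMD path guarded by IsInSPMDModeFlag.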
2428 
2429 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2430  const OMPExecutableDirective &D,
2431  SourceLocation Loc,
2432  llvm::Value *OutlinedFn,
2433  ArrayRef<llvm::Value *> CapturedVars) {
2434  if (!CGF.HaveInsertPoint())
2435  return;
2436 
2437  Address ZeroAddr = CGF.CreateMemTemp(
2438  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2439  /*Name*/ ".zero.addr");
2440  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2441  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2442  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2443  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2444  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2445  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2446 }
2447 
2448 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2449  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2450  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2451  if (!CGF.HaveInsertPoint())
2452  return;
2453 
2454  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2455  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2456  else
2457  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2458 }
2459 
2460 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2461  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2462  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2463  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2464 
2465  // Force inline this outlined function at its call site.
2466  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2467 
2468  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2469  /*DestWidth=*/32, /*Signed=*/1),
2470  ".zero.addr");
2471  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2472  // ThreadId for serialized parallels is 0.
2473  Address ThreadIDAddr = ZeroAddr;
2474  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2475  CodeGenFunction &CGF, PrePostActionTy &Action) {
2476  Action.Enter(CGF);
2477 
2478  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2479  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2480  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2481  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2482  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2483  };
2484  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2485  PrePostActionTy &) {
2486 
2487  RegionCodeGenTy RCG(CodeGen);
2488  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2489  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2490  llvm::Value *Args[] = {RTLoc, ThreadID};
2491 
2492  NVPTXActionTy Action(
2493  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2494  Args,
2495  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2496  Args);
2497  RCG.setAction(Action);
2498  RCG(CGF);
2499  };
2500 
2501  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2502  PrePostActionTy &Action) {
2503  CGBuilderTy &Bld = CGF.Builder;
2504  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2505  assert(WFn && "Wrapper function does not exist!");
2506  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2507 
2508  // Prepare for parallel region. Indicate the outlined function.
2509  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2510  CGF.EmitRuntimeCall(
2511  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2512  Args);
2513 
2514  // Create a private scope that will globalize the arguments
2515  // passed from the outside of the target region.
2516  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2517 
2518  // There's something to share.
2519  if (!CapturedVars.empty()) {
2520  // Prepare for parallel region. Indicate the outlined function.
2521  Address SharedArgs =
2522  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2523  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2524 
2525  llvm::Value *DataSharingArgs[] = {
2526  SharedArgsPtr,
2527  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2528  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2529  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2530  DataSharingArgs);
2531 
2532  // Store variable address in a list of references to pass to workers.
2533  unsigned Idx = 0;
2534  ASTContext &Ctx = CGF.getContext();
2535  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2536  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2537  .castAs<PointerType>());
2538  for (llvm::Value *V : CapturedVars) {
2539  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
2540  CGF.getPointerSize());
2541  llvm::Value *PtrV;
2542  if (V->getType()->isIntegerTy())
2543  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2544  else
2545  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2546  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2547  Ctx.getPointerType(Ctx.VoidPtrTy));
2548  ++Idx;
2549  }
2550  }
2551 
2552  // Activate workers. This barrier is used by the master to signal
2553  // work for the workers.
2554  syncCTAThreads(CGF);
2555 
2556  // OpenMP [2.5, Parallel Construct, p.49]
2557  // There is an implied barrier at the end of a parallel region. After the
2558  // end of a parallel region, only the master thread of the team resumes
2559  // execution of the enclosing task region.
2560  //
2561  // The master waits at this barrier until all workers are done.
2562  syncCTAThreads(CGF);
2563 
2564  if (!CapturedVars.empty())
2565  CGF.EmitRuntimeCall(
2566  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2567 
2568  // Remember for post-processing in worker loop.
2569  Work.emplace_back(WFn);
2570  };
2571 
2572  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2573  CodeGenFunction &CGF, PrePostActionTy &Action) {
2574  if (IsInParallelRegion) {
2575  SeqGen(CGF, Action);
2576  } else if (IsInTargetMasterThreadRegion) {
2577  L0ParallelGen(CGF, Action);
2578  } else {
2579  // Check for master and then parallelism:
2580  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2581  // Serialized execution.
2582  // } else {
2583  // Worker call.
2584  // }
2585  CGBuilderTy &Bld = CGF.Builder;
2586  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2587  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2588  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2589  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2590  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2591  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2592  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2593  // There is no need to emit line number for unconditional branch.
2594  (void)ApplyDebugLocation::CreateEmpty(CGF);
2595  CGF.EmitBlock(ParallelCheckBB);
2596  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2597  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2598  llvm::Value *PL = CGF.EmitRuntimeCall(
2599  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2600  {RTLoc, ThreadID});
2601  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2602  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2603  CGF.EmitBlock(SeqBB);
2604  SeqGen(CGF, Action);
2605  CGF.EmitBranch(ExitBB);
2606  // There is no need to emit line number for unconditional branch.
2607  (void)ApplyDebugLocation::CreateEmpty(CGF);
2608  CGF.EmitBlock(MasterBB);
2609  L0ParallelGen(CGF, Action);
2610  CGF.EmitBranch(ExitBB);
2611  // There is no need to emit line number for unconditional branch.
2612  (void)ApplyDebugLocation::CreateEmpty(CGF);
2613  // Emit the continuation block for code after the if.
2614  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2615  }
2616  };
2617 
2618  if (IfCond) {
2619  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2620  } else {
2621  CodeGenFunction::RunCleanupsScope Scope(CGF);
2622  RegionCodeGenTy ThenRCG(LNParallelGen);
2623  ThenRCG(CGF);
2624  }
2625 }
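// On the L0 path above, the master's side of the handshake with the worker
// loop reduces to roughly the following (illustrative sketch, not literal
// emitted code):
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   if (n_captured != 0)
//     __kmpc_begin_sharing_variables(&arg_list, n_captured);
//   barrier();                        // release the workers
//   barrier();                        // wait until all workers are done
//   if (n_captured != 0)
//     __kmpc_end_sharing_variables();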
2626 
2627 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2628  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2629  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2630  // Just call the outlined function to execute the parallel region.
2631  // OutlinedFn(&GTid, &zero, CapturedStruct);
2632  //
2633  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2634 
2635  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2636  /*DestWidth=*/32, /*Signed=*/1),
2637  ".zero.addr");
2638  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2639  // ThreadId for serialized parallels is 0.
2640  Address ThreadIDAddr = ZeroAddr;
2641  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2642  &ThreadIDAddr](CodeGenFunction &CGF,
2643  PrePostActionTy &Action) {
2644  Action.Enter(CGF);
2645 
2646  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2647  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2648  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2649  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2650  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2651  };
2652  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2653  PrePostActionTy &) {
2654 
2655  RegionCodeGenTy RCG(CodeGen);
2656  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2657  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2658  llvm::Value *Args[] = {RTLoc, ThreadID};
2659 
2660  NVPTXActionTy Action(
2661  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2662  Args,
2663  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2664  Args);
2665  RCG.setAction(Action);
2666  RCG(CGF);
2667  };
2668 
2669  if (IsInTargetMasterThreadRegion) {
2670  // In the target master thread region we need to use the real thread id.
2671  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2672  RegionCodeGenTy RCG(CodeGen);
2673  RCG(CGF);
2674  } else {
2675  // If we are not in the target region, it is definitely L2 parallelism or
2676  // more, because for SPMD mode we always have an L1 parallel level, so we
2677  // don't need to check for orphaned directives.
2678  RegionCodeGenTy RCG(SeqGen);
2679  RCG(CGF);
2680  }
2681 }
2682 
2683 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2684  // Always emit simple barriers!
2685  if (!CGF.HaveInsertPoint())
2686  return;
2687  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2688  // This function does not use parameters, so we can emit just default values.
2689  llvm::Value *Args[] = {
2690  llvm::ConstantPointerNull::get(
2691  cast<llvm::PointerType>(getIdentTyPointerTy())),
2692  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2693  CGF.EmitRuntimeCall(
2694  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2695 }
2696 
2697 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2698  SourceLocation Loc,
2699  OpenMPDirectiveKind Kind, bool,
2700  bool) {
2701  // Always emit simple barriers!
2702  if (!CGF.HaveInsertPoint())
2703  return;
2704  // Build call __kmpc_barrier(loc, thread_id);
2705  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2706  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2707  getThreadID(CGF, Loc)};
2708  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2709 }
2710 
2711 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2712  CodeGenFunction &CGF, StringRef CriticalName,
2713  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2714  const Expr *Hint) {
2715  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2716  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2717  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2718  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2719  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2720 
2721  // Fetch team-local id of the thread.
2722  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2723 
2724  // Get the width of the team.
2725  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2726 
2727  // Initialize the counter variable for the loop.
2728  QualType Int32Ty =
2729  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2730  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2731  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2732  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2733  /*isInit=*/true);
2734 
2735  // Block checks if loop counter exceeds upper bound.
2736  CGF.EmitBlock(LoopBB);
2737  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2738  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2739  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2740 
2741  // This block tests which thread should execute the region; all other
2742  // threads go straight to the synchronisation point.
2743  CGF.EmitBlock(TestBB);
2744  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2745  llvm::Value *CmpThreadToCounter =
2746  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2747  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2748 
2749  // Block emits the body of the critical region.
2750  CGF.EmitBlock(BodyBB);
2751 
2752  // Output the critical statement.
2753  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2754  Hint);
2755 
2756  // After the body surrounded by the critical region, the single executing
2757  // thread will jump to the synchronisation point.
2758  // Block waits for all threads in current team to finish then increments the
2759  // counter variable and returns to the loop.
2760  CGF.EmitBlock(SyncBB);
2761  emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2762  /*ForceSimpleCall=*/true);
2763 
2764  llvm::Value *IncCounterVal =
2765  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2766  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2767  CGF.EmitBranch(LoopBB);
2768 
2769  // Block that is reached when all threads in the team complete the region.
2770  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2771 }
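// In effect, the blocks above serialize the critical region across the
// team (illustrative sketch):
//
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical region body>;      // exactly one thread per iteration
//     barrier();                     // hand off to the next thread
//   }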
2772 
2773 /// Cast value to the specified type.
2774 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2775  QualType ValTy, QualType CastTy,
2776  SourceLocation Loc) {
2777  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2778  "Cast type must be sized.");
2779  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2780  "Val type must be sized.");
2781  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2782  if (ValTy == CastTy)
2783  return Val;
2784  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2785  CGF.getContext().getTypeSizeInChars(CastTy))
2786  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2787  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2788  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2789  CastTy->hasSignedIntegerRepresentation());
2790  Address CastItem = CGF.CreateMemTemp(CastTy);
2791  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2792  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2793  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2794  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2795 }
2796 
2797 /// This function creates calls to one of two shuffle functions to copy
2798 /// variables between lanes in a warp.
2799 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2800  llvm::Value *Elem,
2801  QualType ElemType,
2802  llvm::Value *Offset,
2803  SourceLocation Loc) {
2804  CodeGenModule &CGM = CGF.CGM;
2805  CGBuilderTy &Bld = CGF.Builder;
2806  CGOpenMPRuntimeNVPTX &RT =
2807  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2808 
2809  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2810  assert(Size.getQuantity() <= 8 &&
2811  "Unsupported bitwidth in shuffle instruction.");
2812 
2813  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2814  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2815  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2816 
2817  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2818  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2819  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2820  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2821  llvm::Value *WarpSize =
2822  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2823 
2824  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2825  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2826 
2827  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2828 }
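// Conceptually, each call returns the element held by a remote lane of the
// warp at the given lane_offset, widening sub-word values first (sketch):
//
//   v = __kmpc_shuffle_int32((int32_t)elem, /*lane_offset=*/16, warp_size);
//
// Values wider than 32 bits are routed through __kmpc_shuffle_int64 instead.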
2829 
2830 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2831  Address DestAddr, QualType ElemType,
2832  llvm::Value *Offset, SourceLocation Loc) {
2833  CGBuilderTy &Bld = CGF.Builder;
2834 
2835  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2836  // Create the loop over the big sized data.
2837  // ptr = (void*)Elem;
2838  // ptrEnd = (void*) Elem + 1;
2839  // Step = 8;
2840  // while (ptr + Step < ptrEnd)
2841  // shuffle((int64_t)*ptr);
2842  // Step = 4;
2843  // while (ptr + Step < ptrEnd)
2844  // shuffle((int32_t)*ptr);
2845  // ...
2846  Address ElemPtr = DestAddr;
2847  Address Ptr = SrcAddr;
2848  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2849  Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2850  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2851  if (Size < CharUnits::fromQuantity(IntSize))
2852  continue;
2853  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2854  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2855  /*Signed=*/1);
2856  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2857  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2858  ElemPtr =
2859  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2860  if (Size.getQuantity() / IntSize > 1) {
2861  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2862  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2863  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2864  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2865  CGF.EmitBlock(PreCondBB);
2866  llvm::PHINode *PhiSrc =
2867  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2868  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2869  llvm::PHINode *PhiDest =
2870  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2871  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2872  Ptr = Address(PhiSrc, Ptr.getAlignment());
2873  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2874  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2875  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2876  Ptr.getPointer(), CGF.VoidPtrTy));
2877  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2878  ThenBB, ExitBB);
2879  CGF.EmitBlock(ThenBB);
2880  llvm::Value *Res = createRuntimeShuffleFunction(
2881  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2882  IntType, Offset, Loc);
2883  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2884  Address LocalPtr =
2885  Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2886  Address LocalElemPtr =
2887  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2888  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2889  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2890  CGF.EmitBranch(PreCondBB);
2891  CGF.EmitBlock(ExitBB);
2892  } else {
2893  llvm::Value *Res = createRuntimeShuffleFunction(
2894  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2895  IntType, Offset, Loc);
2896  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2897  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2898  ElemPtr =
2899  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2900  }
2901  Size = Size % IntSize;
2902  }
2903 }
2904 
2905 namespace {
2906 enum CopyAction : unsigned {
2907  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2908  // the warp using shuffle instructions.
2909  RemoteLaneToThread,
2910  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2911  ThreadCopy,
2912  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2913  ThreadToScratchpad,
2914  // ScratchpadToThread: Copy from a scratchpad array in global memory
2915  // containing team-reduced data to a thread's stack.
2916  ScratchpadToThread,
2917 };
2918 } // namespace
2919 
2920 struct CopyOptionsTy {
2921  llvm::Value *RemoteLaneOffset;
2922  llvm::Value *ScratchpadIndex;
2923  llvm::Value *ScratchpadWidth;
2924 };
2925 
2926 /// Emit instructions to copy a Reduce list, which contains partially
2927 /// aggregated values, in the specified direction.
2928 static void emitReductionListCopy(
2929  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2930  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2931  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2932 
2933  CodeGenModule &CGM = CGF.CGM;
2934  ASTContext &C = CGM.getContext();
2935  CGBuilderTy &Bld = CGF.Builder;
2936 
2937  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2938  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2939  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2940 
2941  // Iterates, element-by-element, through the source Reduce list and
2942  // make a copy.
2943  unsigned Idx = 0;
2944  unsigned Size = Privates.size();
2945  for (const Expr *Private : Privates) {
2946  Address SrcElementAddr = Address::invalid();
2947  Address DestElementAddr = Address::invalid();
2948  Address DestElementPtrAddr = Address::invalid();
2949  // Should we shuffle in an element from a remote lane?
2950  bool ShuffleInElement = false;
2951  // Set to true to update the pointer in the dest Reduce list to a
2952  // newly created element.
2953  bool UpdateDestListPtr = false;
2954  // Increment the src or dest pointer to the scratchpad, for each
2955  // new element.
2956  bool IncrScratchpadSrc = false;
2957  bool IncrScratchpadDest = false;
2958 
2959  switch (Action) {
2960  case RemoteLaneToThread: {
2961  // Step 1.1: Get the address for the src element in the Reduce list.
2962  Address SrcElementPtrAddr =
2963  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2964  SrcElementAddr = CGF.EmitLoadOfPointer(
2965  SrcElementPtrAddr,
2966  C.getPointerType(Private->getType())->castAs<PointerType>());
2967 
2968  // Step 1.2: Create a temporary to store the element in the destination
2969  // Reduce list.
2970  DestElementPtrAddr =
2971  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2972  DestElementAddr =
2973  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2974  ShuffleInElement = true;
2975  UpdateDestListPtr = true;
2976  break;
2977  }
2978  case ThreadCopy: {
2979  // Step 1.1: Get the address for the src element in the Reduce list.
2980  Address SrcElementPtrAddr =
2981  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2982  SrcElementAddr = CGF.EmitLoadOfPointer(
2983  SrcElementPtrAddr,
2984  C.getPointerType(Private->getType())->castAs<PointerType>());
2985 
2986  // Step 1.2: Get the address for dest element. The destination
2987  // element has already been created on the thread's stack.
2988  DestElementPtrAddr =
2989  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2990  DestElementAddr = CGF.EmitLoadOfPointer(
2991  DestElementPtrAddr,
2992  C.getPointerType(Private->getType())->castAs<PointerType>());
2993  break;
2994  }
2995  case ThreadToScratchpad: {
2996  // Step 1.1: Get the address for the src element in the Reduce list.
2997  Address SrcElementPtrAddr =
2998  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2999  SrcElementAddr = CGF.EmitLoadOfPointer(
3000  SrcElementPtrAddr,
3001  C.getPointerType(Private->getType())->castAs<PointerType>());
3002 
3003  // Step 1.2: Get the address for dest element:
3004  // address = base + index * ElementSizeInChars.
3005  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3006  llvm::Value *CurrentOffset =
3007  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3008  llvm::Value *ScratchPadElemAbsolutePtrVal =
3009  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3010  ScratchPadElemAbsolutePtrVal =
3011  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3012  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3013  C.getTypeAlignInChars(Private->getType()));
3014  IncrScratchpadDest = true;
3015  break;
3016  }
3017  case ScratchpadToThread: {
3018  // Step 1.1: Get the address for the src element in the scratchpad.
3019  // address = base + index * ElementSizeInChars.
3020  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3021  llvm::Value *CurrentOffset =
3022  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3023  llvm::Value *ScratchPadElemAbsolutePtrVal =
3024  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3025  ScratchPadElemAbsolutePtrVal =
3026  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3027  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3028  C.getTypeAlignInChars(Private->getType()));
3029  IncrScratchpadSrc = true;
3030 
3031  // Step 1.2: Create a temporary to store the element in the destination
3032  // Reduce list.
3033  DestElementPtrAddr =
3034  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
3035  DestElementAddr =
3036  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3037  UpdateDestListPtr = true;
3038  break;
3039  }
3040  }
3041 
3042  // Regardless of the source and destination of the copy, we emit the
3043  // load of the source element, as this is required in all directions.
3044  SrcElementAddr = Bld.CreateElementBitCast(
3045  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3046  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3047  SrcElementAddr.getElementType());
3048 
3049  // Now that all active lanes have read the element in the
3050  // Reduce list, shuffle over the value from the remote lane.
3051  if (ShuffleInElement) {
3052  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3053  RemoteLaneOffset, Private->getExprLoc());
3054  } else {
3055  if (Private->getType()->isScalarType()) {
3056  llvm::Value *Elem =
3057  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3058  Private->getType(), Private->getExprLoc());
3059  // Store the source element value to the dest element address.
3060  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3061  Private->getType());
3062  } else {
3063  CGF.EmitAggregateCopy(
3064  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3065  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3066  Private->getType(), AggValueSlot::DoesNotOverlap);
3067  }
3068  }
3069 
3070  // Step 3.1: Modify reference in dest Reduce list as needed.
3071  // Modifying the reference in Reduce list to point to the newly
3072  // created element. The element is live in the current function
3073  // scope and that of functions it invokes (i.e., reduce_function).
3074  // RemoteReduceData[i] = (void*)&RemoteElem
3075  if (UpdateDestListPtr) {
3076  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3077  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3078  DestElementPtrAddr, /*Volatile=*/false,
3079  C.VoidPtrTy);
3080  }
3081 
3082  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3083  // address of the next element in scratchpad memory, unless we're currently
3084  // processing the last one. Memory alignment is also taken care of here.
3085  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3086  llvm::Value *ScratchpadBasePtr =
3087  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3088  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3089  ScratchpadBasePtr = Bld.CreateNUWAdd(
3090  ScratchpadBasePtr,
3091  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3092 
3093  // Take care of global memory alignment for performance
3094  ScratchpadBasePtr = Bld.CreateNUWSub(
3095  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3096  ScratchpadBasePtr = Bld.CreateUDiv(
3097  ScratchpadBasePtr,
3098  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3099  ScratchpadBasePtr = Bld.CreateNUWAdd(
3100  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3101  ScratchpadBasePtr = Bld.CreateNUWMul(
3102  ScratchpadBasePtr,
3103  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3104 
3105  if (IncrScratchpadDest)
3106  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3107  else /* IncrScratchpadSrc = true */
3108  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3109  }
3110 
3111  ++Idx;
3112  }
3113 }
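To make the four directions concrete, the following is a hypothetical host-side model of the copies this routine emits. A Reduce list is modeled as an array of pointers to the private copies and the scratchpad as a flat buffer; all names are illustrative, not the generated code:

#include <cstddef>
#include <cstring>
#include <vector>

struct Elem { void *Ptr; std::size_t Size; };  // one slot of a Reduce list

// Model of ThreadCopy: copy each element from one stack-hosted Reduce list
// into another, element by element.
void threadCopy(const std::vector<Elem> &Src, std::vector<Elem> &Dst) {
  for (std::size_t I = 0; I < Src.size(); ++I)
    std::memcpy(Dst[I].Ptr, Src[I].Ptr, Src[I].Size);
}

// Model of ThreadToScratchpad: element I of team 'Index' lands at
// base(I) + Index * ElementSize, matching the address arithmetic above.
void threadToScratchpad(const std::vector<Elem> &Src, char *Scratchpad,
                        std::size_t Index, std::size_t Width) {
  char *Base = Scratchpad;
  for (const Elem &E : Src) {
    std::memcpy(Base + Index * E.Size, E.Ptr, E.Size);
    Base += Width * E.Size;  // advance past all teams' copies of element I
  }
}

The generated code additionally rounds each per-element scratchpad base up to GlobalMemoryAlignment, which this sketch omits.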
3114 
3115 /// This function emits a helper that gathers Reduce lists from the first
3116 /// lane of every active warp to lanes in the first warp.
3117 ///
3118 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3119 /// shared smem[warp_size];
3120 /// For all data entries D in reduce_data:
3121 /// sync
3122 /// If (I am the first lane in each warp)
3123 /// Copy my local D to smem[warp_id]
3124 /// sync
3125 /// if (I am the first warp)
3126 /// Copy smem[thread_id] to my local D
3127 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3128  ArrayRef<const Expr *> Privates,
3129  QualType ReductionArrayTy,
3130  SourceLocation Loc) {
3131  ASTContext &C = CGM.getContext();
3132  llvm::Module &M = CGM.getModule();
3133 
3134  // ReduceList: thread local Reduce list.
3135  // At the stage of the computation when this function is called, partially
3136  // aggregated values reside in the first lane of every active warp.
3137  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3138  C.VoidPtrTy, ImplicitParamDecl::Other);
3139  // NumWarps: number of warps active in the parallel region. This could
3140  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3141  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3142  C.getIntTypeForBitwidth(32, /* Signed */ true),
3143  ImplicitParamDecl::Other);
3144  FunctionArgList Args;
3145  Args.push_back(&ReduceListArg);
3146  Args.push_back(&NumWarpsArg);
3147 
3148  const CGFunctionInfo &CGFI =
3149  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3150  auto *Fn = llvm::Function::Create(
3151  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3152  "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
3153  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3154  Fn->setDoesNotRecurse();
3155  CodeGenFunction CGF(CGM);
3156  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3157 
3158  CGBuilderTy &Bld = CGF.Builder;
3159 
3160  // This array is used as a medium to transfer, one reduce element at a time,
3161  // the data from the first lane of every warp to lanes in the first warp
3162  // in order to perform the final step of a reduction in a parallel region
3163  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3164  // for reduced latency, as well as to have a distinct copy for concurrently
3165  // executing target regions. The array is declared with common linkage so
3166  // as to be shared across compilation units.
3167  StringRef TransferMediumName =
3168  "__openmp_nvptx_data_transfer_temporary_storage";
3169  llvm::GlobalVariable *TransferMedium =
3170  M.getGlobalVariable(TransferMediumName);
3171  if (!TransferMedium) {
3172  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3173  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3174  TransferMedium = new llvm::GlobalVariable(
3175  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3176  llvm::Constant::getNullValue(Ty), TransferMediumName,
3177  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3178  SharedAddressSpace);
3179  CGM.addCompilerUsedGlobal(TransferMedium);
3180  }
3181 
3182  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3183  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3184  // nvptx_lane_id = nvptx_id % warpsize
3185  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3186  // nvptx_warp_id = nvptx_id / warpsize
3187  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3188 
3189  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3190  Address LocalReduceList(
3191  Bld.CreatePointerBitCastOrAddrSpaceCast(
3192  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3193  C.VoidPtrTy, Loc),
3194  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3195  CGF.getPointerAlign());
3196 
3197  unsigned Idx = 0;
3198  for (const Expr *Private : Privates) {
3199  //
3200  // Warp master copies reduce element to transfer medium in __shared__
3201  // memory.
3202  //
3203  unsigned RealTySize =
3204  C.getTypeSizeInChars(Private->getType())
3205  .alignTo(C.getTypeAlignInChars(Private->getType()))
3206  .getQuantity();
3207  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3208  unsigned NumIters = RealTySize / TySize;
3209  if (NumIters == 0)
3210  continue;
3211  QualType CType = C.getIntTypeForBitwidth(
3212  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3213  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3214  CharUnits Align = CharUnits::fromQuantity(TySize);
3215  llvm::Value *Cnt = nullptr;
3216  Address CntAddr = Address::invalid();
3217  llvm::BasicBlock *PrecondBB = nullptr;
3218  llvm::BasicBlock *ExitBB = nullptr;
3219  if (NumIters > 1) {
3220  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3221  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3222  /*Volatile=*/false, C.IntTy);
3223  PrecondBB = CGF.createBasicBlock("precond");
3224  ExitBB = CGF.createBasicBlock("exit");
3225  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3226  // There is no need to emit line number for unconditional branch.
3227  (void)ApplyDebugLocation::CreateEmpty(CGF);
3228  CGF.EmitBlock(PrecondBB);
3229  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3230  llvm::Value *Cmp =
3231  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3232  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3233  CGF.EmitBlock(BodyBB);
3234  }
3235  // kmpc_barrier.
3236  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3237  /*EmitChecks=*/false,
3238  /*ForceSimpleCall=*/true);
3239  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3240  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3241  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3242 
3243  // if (lane_id == 0)
3244  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3245  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3246  CGF.EmitBlock(ThenBB);
3247 
3248  // Reduce element = LocalReduceList[i]
3249  Address ElemPtrPtrAddr =
3250  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3251  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3252  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3253  // elemptr = ((CopyType*)(elemptrptr)) + I
3254  Address ElemPtr = Address(ElemPtrPtr, Align);
3255  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3256  if (NumIters > 1) {
3257  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3258  ElemPtr.getAlignment());
3259  }
3260 
3261  // Get pointer to location in transfer medium.
3262  // MediumPtr = &medium[warp_id]
3263  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3264  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3265  Address MediumPtr(MediumPtrVal, Align);
3266  // Casting to actual data type.
3267  // MediumPtr = (CopyType*)MediumPtrAddr;
3268  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3269 
3270  // elem = *elemptr
3271  // *MediumPtr = elem
3272  llvm::Value *Elem =
3273  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3274  // Store the source element value to the dest element address.
3275  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3276 
3277  Bld.CreateBr(MergeBB);
3278 
3279  CGF.EmitBlock(ElseBB);
3280  Bld.CreateBr(MergeBB);
3281 
3282  CGF.EmitBlock(MergeBB);
3283 
3284  // kmpc_barrier.
3285  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3286  /*EmitChecks=*/false,
3287  /*ForceSimpleCall=*/true);
3288 
3289  //
3290  // Warp 0 copies reduce element from transfer medium.
3291  //
3292  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3293  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3294  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3295 
3296  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3297  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3298  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3299 
3300  // Up to 32 threads in warp 0 are active.
3301  llvm::Value *IsActiveThread =
3302  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3303  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3304 
3305  CGF.EmitBlock(W0ThenBB);
3306 
3307  // SrcMediumPtr = &medium[tid]
3308  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3309  TransferMedium,
3310  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3311  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3312  // SrcMediumVal = *SrcMediumPtr;
3313  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3314 
3315  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3316  Address TargetElemPtrPtr =
3317  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3318  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3319  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3320  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3321  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3322  if (NumIters > 1) {
3323  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3324  TargetElemPtr.getAlignment());
3325  }
3326 
3327  // *TargetElemPtr = SrcMediumVal;
3328  llvm::Value *SrcMediumValue =
3329  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3330  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3331  CType);
3332  Bld.CreateBr(W0MergeBB);
3333 
3334  CGF.EmitBlock(W0ElseBB);
3335  Bld.CreateBr(W0MergeBB);
3336 
3337  CGF.EmitBlock(W0MergeBB);
3338 
3339  if (NumIters > 1) {
3340  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3341  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3342  CGF.EmitBranch(PrecondBB);
3343  (void)ApplyDebugLocation::CreateEmpty(CGF);
3344  CGF.EmitBlock(ExitBB);
3345  }
3346  RealTySize %= TySize;
3347  }
3348  ++Idx;
3349  }
3350 
3351  CGF.FinishFunction();
3352  return Fn;
3353 }
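The two barrier-separated phases are easier to see in a sequential model. A runnable sketch (illustrative; real execution is parallel, with the kmpc_barrier calls separating the phases):

#include <array>
#include <cstdio>

int main() {
  constexpr int WarpSize = 32;
  const int NumWarps = 4;                     // warps active in the region
  std::array<int, WarpSize> TransferMedium{}; // stands in for the __shared__ array
  // Phase 1: lane 0 of each warp writes its reduced value (here 100 + warp id).
  for (int W = 0; W < NumWarps; ++W)
    TransferMedium[W] = 100 + W;              // smem[warp_id] = my local D
  // (barrier between phases in the real code)
  // Phase 2: threads 0..NumWarps-1 of warp 0 each read their slot.
  for (int Tid = 0; Tid < NumWarps; ++Tid)
    std::printf("warp-0 lane %d received %d\n", Tid, TransferMedium[Tid]);
  return 0;
}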
3354 
3355 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3356 /// in the same warp. It uses shuffle instructions to copy over data from
3357 /// a remote lane's stack. The reduction algorithm performed is specified
3358 /// by the fourth parameter.
3359 ///
3360 /// Algorithm Versions.
3361 /// Full Warp Reduce (argument value 0):
3362 /// This algorithm assumes that all 32 lanes are active and gathers
3363 /// data from these 32 lanes, producing a single resultant value.
3364 /// Contiguous Partial Warp Reduce (argument value 1):
3365 /// This algorithm assumes that only a *contiguous* subset of lanes
3366 /// are active. This happens for the last warp in a parallel region
3367 /// when the user-specified num_threads is not an integer multiple of
3368 /// 32. This contiguous subset always starts with the zeroth lane.
3369 /// Partial Warp Reduce (argument value 2):
3370 /// This algorithm gathers data from any number of lanes at any position.
3371 /// All reduced values are stored in the lowest possible lane. The set
3372 /// of problems every algorithm addresses is a superset of those
3373 /// addressable by algorithms with a lower version number. Overhead
3374 /// increases as algorithm version increases.
3375 ///
3376 /// Terminology
3377 /// Reduce element:
3378 /// Reduce element refers to the individual data field with primitive
3379 /// data types to be combined and reduced across threads.
3380 /// Reduce list:
3381 /// Reduce list refers to a collection of local, thread-private
3382 /// reduce elements.
3383 /// Remote Reduce list:
3384 /// Remote Reduce list refers to a collection of remote (relative to
3385 /// the current thread) reduce elements.
3386 ///
3387 /// We distinguish between three states of threads that are important to
3388 /// the implementation of this function.
3389 /// Alive threads:
3390 /// Threads in a warp executing the SIMT instruction, as distinguished from
3391 /// threads that are inactive due to divergent control flow.
3392 /// Active threads:
3393 /// The minimal set of threads that has to be alive upon entry to this
3394 /// function. The computation is correct iff active threads are alive.
3395 /// Some threads are alive but they are not active because they do not
3396 /// contribute to the computation in any useful manner. Turning them off
3397 /// may introduce control flow overheads without any tangible benefits.
3398 /// Effective threads:
3399 /// In order to comply with the argument requirements of the shuffle
3400 /// function, we must keep all lanes holding data alive. But at most
3401 /// half of them perform value aggregation; we refer to this half of
3402 /// threads as effective. The other half is simply handing off their
3403 /// data.
3404 ///
3405 /// Procedure
3406 /// Value shuffle:
3407 /// In this step active threads transfer data from higher lane positions
3408 /// in the warp to lower lane positions, creating Remote Reduce list.
3409 /// Value aggregation:
3410 /// In this step, effective threads combine their thread local Reduce list
3411 /// with Remote Reduce list and store the result in the thread local
3412 /// Reduce list.
3413 /// Value copy:
3414 /// In this step, we deal with the assumption made by algorithm 2
3415 /// (i.e. contiguity assumption). When we have an odd number of lanes
3416 /// active, say 2k+1, only k threads will be effective and therefore k
3417 /// new values will be produced. However, the Reduce list owned by the
3418 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3419 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3420 /// that the contiguity assumption still holds.
3421 static llvm::Value *emitShuffleAndReduceFunction(
3422  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3423  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
3424  ASTContext &C = CGM.getContext();
3425 
3426  // Thread local Reduce list used to host the values of data to be reduced.
3427  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3428  C.VoidPtrTy, ImplicitParamDecl::Other);
3429  // Current lane id; could be logical.
3430  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3431  ImplicitParamDecl::Other);
3432  // Offset of the remote source lane relative to the current lane.
3433  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3434  C.ShortTy, ImplicitParamDecl::Other);
3435  // Algorithm version. This is expected to be known at compile time.
3436  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3437  C.ShortTy, ImplicitParamDecl::Other);
3438  FunctionArgList Args;
3439  Args.push_back(&ReduceListArg);
3440  Args.push_back(&LaneIDArg);
3441  Args.push_back(&RemoteLaneOffsetArg);
3442  Args.push_back(&AlgoVerArg);
3443 
3444  const CGFunctionInfo &CGFI =
3445  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3446  auto *Fn = llvm::Function::Create(
3447  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3448  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3449  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3450  Fn->setDoesNotRecurse();
3451  CodeGenFunction CGF(CGM);
3452  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3453 
3454  CGBuilderTy &Bld = CGF.Builder;
3455 
3456  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3457  Address LocalReduceList(
3458  Bld.CreatePointerBitCastOrAddrSpaceCast(
3459  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3460  C.VoidPtrTy, SourceLocation()),
3461  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3462  CGF.getPointerAlign());
3463 
3464  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3465  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3466  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3467 
3468  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3469  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3470  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3471 
3472  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3473  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3474  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3475 
3476  // Create a local thread-private variable to host the Reduce list
3477  // from a remote lane.
3478  Address RemoteReduceList =
3479  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3480 
3481  // This loop iterates through the list of reduce elements and copies,
3482  // element by element, from a remote lane in the warp to RemoteReduceList,
3483  // hosted on the thread's stack.
3484  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3485  LocalReduceList, RemoteReduceList,
3486  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3487  /*ScratchpadIndex=*/nullptr,
3488  /*ScratchpadWidth=*/nullptr});
3489 
3490  // The actions to be performed on the Remote Reduce list are dependent
3491  // on the algorithm version.
3492  //
3493  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3494  // LaneId % 2 == 0 && Offset > 0):
3495  // do the reduction value aggregation
3496  //
3497  // The thread local variable Reduce list is mutated in place to host the
3498  // reduced data, which is the aggregated value produced from local and
3499  // remote lanes.
3500  //
3501  // Note that AlgoVer is expected to be a constant integer known at compile
3502  // time.
3503  // When AlgoVer==0, the first conjunction evaluates to true, making
3504  // the entire predicate true at compile time.
3505  // When AlgoVer==1, only the second part of the second conjunction is
3506  // evaluated at runtime; the other conjunctions evaluate to false
3507  // at compile time.
3508  // When AlgoVer==2, only the second part of the third conjunction is
3509  // evaluated at runtime; the other conjunctions evaluate to false
3510  // at compile time.
3511  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3512 
3513  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3514  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3515  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3516 
3517  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3518  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3519  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3520  CondAlgo2 = Bld.CreateAnd(
3521  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3522 
3523  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3524  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3525 
3526  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3527  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3528  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3529  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3530 
3531  CGF.EmitBlock(ThenBB);
3532  // reduce_function(LocalReduceList, RemoteReduceList)
3533  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3534  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3535  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3536  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3537  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3538  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3539  Bld.CreateBr(MergeBB);
3540 
3541  CGF.EmitBlock(ElseBB);
3542  Bld.CreateBr(MergeBB);
3543 
3544  CGF.EmitBlock(MergeBB);
3545 
3546  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3547  // Reduce list.
3548  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3549  llvm::Value *CondCopy = Bld.CreateAnd(
3550  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3551 
3552  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3553  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3554  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3555  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3556 
3557  CGF.EmitBlock(CpyThenBB);
3558  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3559  RemoteReduceList, LocalReduceList);
3560  Bld.CreateBr(CpyMergeBB);
3561 
3562  CGF.EmitBlock(CpyElseBB);
3563  Bld.CreateBr(CpyMergeBB);
3564 
3565  CGF.EmitBlock(CpyMergeBB);
3566 
3567  CGF.FinishFunction();
3568  return Fn;
3569 }
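The reduce and copy predicates this helper materializes as IR can be restated in plain C++. A sketch under the same three algorithm versions (since AlgoVer is a compile-time constant in the generated function, two of the three conjunctions always fold away):

#include <cstdint>

// Mirrors CondAlgo0 || CondAlgo1 || CondAlgo2 above.
bool shouldReduce(uint16_t AlgoVer, uint16_t LaneId, uint16_t Offset) {
  bool CondAlgo0 = AlgoVer == 0;                                   // full warp
  bool CondAlgo1 = AlgoVer == 1 && LaneId < Offset;                // contiguous
  bool CondAlgo2 = AlgoVer == 2 && LaneId % 2 == 0 && Offset > 0;  // dispersed
  return CondAlgo0 || CondAlgo1 || CondAlgo2;
}

// Mirrors the trailing copy: under algorithm 1, lanes at or past the offset
// adopt the remote Reduce list so the contiguity assumption keeps holding.
bool shouldCopyRemote(uint16_t AlgoVer, uint16_t LaneId, uint16_t Offset) {
  return AlgoVer == 1 && LaneId >= Offset;
}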
3570 
3571 ///
3572 /// Design of OpenMP reductions on the GPU
3573 ///
3574 /// Consider a typical OpenMP program with one or more reduction
3575 /// clauses:
3576 ///
3577 /// float foo;
3578 /// double bar;
3579 /// #pragma omp target teams distribute parallel for \
3580 /// reduction(+:foo) reduction(*:bar)
3581 /// for (int i = 0; i < N; i++) {
3582 /// foo += A[i]; bar *= B[i];
3583 /// }
3584 ///
3585 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3586 /// all teams. In our OpenMP implementation on the NVPTX device an
3587 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3588 /// within a team are mapped to CUDA threads within a threadblock.
3589 /// Our goal is to efficiently aggregate values across all OpenMP
3590 /// threads such that:
3591 ///
3592 /// - the compiler and runtime are logically concise, and
3593 /// - the reduction is performed efficiently in a hierarchical
3594 /// manner as follows: within OpenMP threads in the same warp,
3595 /// across warps in a threadblock, and finally across teams on
3596 /// the NVPTX device.
3597 ///
3598 /// Introduction to Decoupling
3599 ///
3600 /// We would like to decouple the compiler and the runtime so that the
3601 /// latter is ignorant of the reduction variables (number, data types)
3602 /// and the reduction operators. This allows a simpler interface
3603 /// and implementation while still attaining good performance.
3604 ///
3605 /// Pseudocode for the aforementioned OpenMP program generated by the
3606 /// compiler is as follows:
3607 ///
3608 /// 1. Create private copies of reduction variables on each OpenMP
3609 /// thread: 'foo_private', 'bar_private'
3610 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3611 /// to it and writes the result in 'foo_private' and 'bar_private'
3612 /// respectively.
3613 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3614 /// and store the result on the team master:
3615 ///
3616 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3617 /// reduceData, shuffleReduceFn, interWarpCpyFn)
3618 ///
3619 /// where:
3620 /// struct ReduceData {
3621 /// double *foo;
3622 /// double *bar;
3623 /// } reduceData
3624 /// reduceData.foo = &foo_private
3625 /// reduceData.bar = &bar_private
3626 ///
3627 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3628 /// auxiliary functions generated by the compiler that operate on
3629 /// variables of type 'ReduceData'. They aid the runtime in performing
3630 /// algorithmic steps in a data-agnostic manner.
3631 ///
3632 /// 'shuffleReduceFn' is a pointer to a function that reduces data
3633 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3634 /// same warp. It takes the following arguments as input:
3635 ///
3636 /// a. variable of type 'ReduceData' on the calling lane,
3637 /// b. its lane_id,
3638 /// c. an offset relative to the current lane_id to generate a
3639 /// remote_lane_id. The remote lane contains the second
3640 /// variable of type 'ReduceData' that is to be reduced.
3641 /// d. an algorithm version parameter determining which reduction
3642 /// algorithm to use.
3643 ///
3644 /// 'shuffleReduceFn' retrieves data from the remote lane using
3645 /// efficient GPU shuffle intrinsics and reduces, using the
3646 /// algorithm specified by the 4th parameter, the two operands
3647 /// element-wise. The result is written to the first operand.
3648 ///
3649 /// Different reduction algorithms are implemented in different
3650 /// runtime functions, all calling 'shuffleReduceFn' to perform
3651 /// the essential reduction step. Therefore, based on the 4th
3652 /// parameter, this function behaves slightly differently to
3653 /// cooperate with the runtime to ensure correctness under
3654 /// different circumstances.
3655 ///
3656 /// 'InterWarpCpyFn' is a pointer to a function that transfers
3657 /// reduced variables across warps. It tunnels, through CUDA
3658 /// shared memory, the thread-private data of type 'ReduceData'
3659 /// from lane 0 of each warp to a lane in the first warp.
3660 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3661 /// The last team writes the global reduced value to memory.
3662 ///
3663 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3664 /// reduceData, shuffleReduceFn, interWarpCpyFn,
3665 /// scratchpadCopyFn, loadAndReduceFn)
3666 ///
3667 /// 'scratchpadCopyFn' is a helper that stores reduced
3668 /// data from the team master to a scratchpad array in
3669 /// global memory.
3670 ///
3671 /// 'loadAndReduceFn' is a helper that loads data from
3672 /// the scratchpad array and reduces it with the input
3673 /// operand.
3674 ///
3675 /// These compiler generated functions hide address
3676 /// calculation and alignment information from the runtime.
3677 /// 5. if ret == 1:
3678 /// The team master of the last team stores the reduced
3679 /// result to the globals in memory.
3680 /// foo += reduceData.foo; bar *= reduceData.bar
3681 ///
3682 ///
3683 /// Warp Reduction Algorithms
3684 ///
3685 /// On the warp level, we have three algorithms implemented in the
3686 /// OpenMP runtime depending on the number of active lanes:
3687 ///
3688 /// Full Warp Reduction
3689 ///
3690 /// The reduce algorithm within a warp where all lanes are active
3691 /// is implemented in the runtime as follows:
3692 ///
3693 /// full_warp_reduce(void *reduce_data,
3694 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3695 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3696 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3697 /// }
3698 ///
3699 /// The algorithm completes in log(2, WARPSIZE) steps.
3700 ///
3701 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3702 /// not used; we therefore save instructions by not retrieving lane_id
3703 /// from the corresponding special registers. The 4th parameter, which
3704 /// represents the version of the algorithm being used, is set to 0 to
3705 /// signify full warp reduction.
3706 ///
3707 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3708 ///
3709 /// #reduce_elem refers to an element in the local lane's data structure
3710 /// #remote_elem is retrieved from a remote lane
3711 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3712 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3713 ///
3714 /// Contiguous Partial Warp Reduction
3715 ///
3716 /// This reduce algorithm is used within a warp where only the first
3717 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3718 /// number of OpenMP threads in a parallel region is not a multiple of
3719 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3720 ///
3721 /// void
3722 /// contiguous_partial_reduce(void *reduce_data,
3723 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3724 /// int size, int lane_id) {
3725 /// int curr_size;
3726 /// int offset;
3727 /// curr_size = size;
3728 /// offset = curr_size/2;
3729 /// while (offset > 0) {
3730 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3731 /// curr_size = (curr_size+1)/2;
3732 /// offset = curr_size/2;
3733 /// }
3734 /// }
3735 ///
3736 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3737 ///
3738 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3739 /// if (lane_id < offset)
3740 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3741 /// else
3742 /// reduce_elem = remote_elem
3743 ///
3744 /// This algorithm assumes that the data to be reduced are located in a
3745 /// contiguous subset of lanes starting from the first. When there is
3746 /// an odd number of active lanes, the data in the last lane is not
3747 /// aggregated with any other lane's data but is instead copied over.
3748 ///
3749 /// Dispersed Partial Warp Reduction
3750 ///
3751 /// This algorithm is used within a warp when any discontiguous subset of
3752 /// lanes are active. It is used to implement the reduction operation
3753 /// across lanes in an OpenMP simd region or in a nested parallel region.
3754 ///
3755 /// void
3756 /// dispersed_partial_reduce(void *reduce_data,
3757 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3758 /// int size, remote_id;
3759 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3760 /// do {
3761 /// remote_id = next_active_lane_id_right_after_me();
3762 /// # the above function returns 0 if no active lane
3763 /// # is present right after the current lane.
3764 /// size = number_of_active_lanes_in_this_warp();
3765 /// logical_lane_id /= 2;
3766 /// ShuffleReduceFn(reduce_data, logical_lane_id,
3767 /// remote_id-1-threadIdx.x, 2);
3768 /// } while (logical_lane_id % 2 == 0 && size > 1);
3769 /// }
3770 ///
3771 /// There is no assumption made about the initial state of the reduction.
3772 /// Any number of lanes (>=1) could be active at any position. The reduction
3773 /// result is returned in the first active lane.
3774 ///
3775 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3776 ///
3777 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3778 /// if (lane_id % 2 == 0 && offset > 0)
3779 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3780 /// else
3781 /// reduce_elem = remote_elem
3782 ///
3783 ///
3784 /// Intra-Team Reduction
3785 ///
3786 /// This function, as implemented in the runtime call
3787 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3788 /// threads in a team. It first reduces within a warp using the
3789 /// aforementioned algorithms. We then proceed to gather all such
3790 /// reduced values at the first warp.
3791 ///
3792 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3793 /// data from each of the "warp masters" (the zeroth lane of each warp, where
3794 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3795 /// a mathematical sense) the problem of reduction across warp masters in
3796 /// a block to the problem of warp reduction.
3797 ///
3798 ///
3799 /// Inter-Team Reduction
3800 ///
3801 /// Once a team has reduced its data to a single value, it is stored in
3802 /// a global scratchpad array. Since each team has a distinct slot, this
3803 /// can be done without locking.
3804 ///
3805 /// The last team to write to the scratchpad array proceeds to reduce the
3806 /// scratchpad array. One or more workers in the last team use the helper
3807 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
3808 /// the k'th worker reduces every k'th element.
3809 ///
3810 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3811 /// reduce across workers and compute a globally reduced value.
3812 ///
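The Full Warp Reduce pattern documented above can be checked with a runnable host-side simulation (illustrative; shuffle_down is modeled by reading a higher-indexed slot of a plain array, and REDUCE_OP is '+'):

#include <cstdio>

int main() {
  constexpr int WarpSize = 32;
  int Lane[WarpSize];
  for (int I = 0; I < WarpSize; ++I)
    Lane[I] = I + 1;                       // reduce 1 + 2 + ... + 32
  for (int Offset = WarpSize / 2; Offset > 0; Offset /= 2)
    // Ascending order makes Lane[L + Offset] the pre-round value, which is
    // exactly what a simultaneous shuffle_down would deliver.
    for (int L = 0; L < WarpSize; ++L) {
      int Remote = (L + Offset < WarpSize) ? Lane[L + Offset] : Lane[L];
      Lane[L] += Remote;                   // reduce_elem REDUCE_OP remote_elem
    }
  std::printf("lane 0 holds %d (expect 528)\n", Lane[0]);
  return 0;
}

After log2(32) = 5 rounds lane 0 holds the full sum; the values left in higher lanes are don't-cares, exactly as in the real algorithm.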
3813 void CGOpenMPRuntimeNVPTX::emitReduction(
3814  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3815  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3816  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3817  if (!CGF.HaveInsertPoint())
3818  return;
3819 
3820  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3821 #ifndef NDEBUG
3822  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3823 #endif
3824 
3825  if (Options.SimpleReduction) {
3826  assert(!TeamsReduction && !ParallelReduction &&
3827  "Invalid reduction selection in emitReduction.");
3828  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3829  ReductionOps, Options);
3830  return;
3831  }
3832 
3833  assert((TeamsReduction || ParallelReduction) &&
3834  "Invalid reduction selection in emitReduction.");
3835 
3836  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3837  // RedList, shuffle_reduce_func, interwarp_copy_func);
3838  // or
3839  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3840  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3841  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3842 
3843  llvm::Value *Res;
3844  if (ParallelReduction) {
3845  ASTContext &C = CGM.getContext();
3846  // 1. Build a list of reduction variables.
3847  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3848  auto Size = RHSExprs.size();
3849  for (const Expr *E : Privates) {
3850  if (E->getType()->isVariablyModifiedType())
3851  // Reserve place for array size.
3852  ++Size;
3853  }
3854  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3855  QualType ReductionArrayTy =
3856  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
3857  /*IndexTypeQuals=*/0);
3858  Address ReductionList =
3859  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3860  auto IPriv = Privates.begin();
3861  unsigned Idx = 0;
3862  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3863  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3864  CGF.getPointerSize());
3865  CGF.Builder.CreateStore(
3866  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3867  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
3868  Elem);
3869  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3870  // Store array size.
3871  ++Idx;
3872  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3873  CGF.getPointerSize());
3874  llvm::Value *Size = CGF.Builder.CreateIntCast(
3875  CGF.getVLASize(
3876  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3877  .NumElts,
3878  CGF.SizeTy, /*isSigned=*/false);
3879  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3880  Elem);
3881  }
3882  }
3883 
3884  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3885  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3886  ReductionList.getPointer(), CGF.VoidPtrTy);
3887  llvm::Value *ReductionFn = emitReductionFunction(
3888  CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
3889  Privates, LHSExprs, RHSExprs, ReductionOps);
3890  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3891  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3892  llvm::Value *InterWarpCopyFn =
3893  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3894 
3895  llvm::Value *Args[] = {RTLoc,
3896  ThreadId,
3897  CGF.Builder.getInt32(RHSExprs.size()),
3898  ReductionArrayTySize,
3899  RL,
3900  ShuffleAndReduceFn,
3901  InterWarpCopyFn};
3902 
3903  Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
3904  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2),
3905  Args);
3906  } else {
3907  assert(TeamsReduction && "expected teams reduction.");
3908  std::string Name = getName({"reduction"});
3909  llvm::Value *Lock = getCriticalRegionLock(Name);
3910  llvm::Value *Args[] = {RTLoc, ThreadId, Lock};
3911  Res = CGF.EmitRuntimeCall(
3912  createNVPTXRuntimeFunction(
3913  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple),
3914  Args);
3915  }
3916 
3917  // 5. Build if (res == 1)
3918  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3919  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3920  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3921  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3922  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3923 
3924  // 6. Build then branch: where we have reduced values in the master
3925  // thread in each team.
3926  // __kmpc_end_reduce{_nowait}(<gtid>);
3927  // break;
3928  CGF.EmitBlock(ThenBB);
3929 
3930  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3931  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3932  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3933  auto IPriv = Privates.begin();
3934  auto ILHS = LHSExprs.begin();
3935  auto IRHS = RHSExprs.begin();
3936  for (const Expr *E : ReductionOps) {
3937  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3938  cast<DeclRefExpr>(*IRHS));
3939  ++IPriv;
3940  ++ILHS;
3941  ++IRHS;
3942  }
3943  };
3944  if (ParallelReduction) {
3945  llvm::Value *EndArgs[] = {ThreadId};
3946  RegionCodeGenTy RCG(CodeGen);
3947  NVPTXActionTy Action(
3948  nullptr, llvm::None,
3949  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
3950  EndArgs);
3951  RCG.setAction(Action);
3952  RCG(CGF);
3953  } else {
3954  assert(TeamsReduction && "expected teams reduction.");
3955  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3956  std::string Name = getName({"reduction"});
3957  llvm::Value *Lock = getCriticalRegionLock(Name);
3958  llvm::Value *EndArgs[] = {RTLoc, ThreadId, Lock};
3959  RegionCodeGenTy RCG(CodeGen);
3960  NVPTXActionTy Action(
3961  nullptr, llvm::None,
3962  createNVPTXRuntimeFunction(
3963  OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple),
3964  EndArgs);
3965  RCG.setAction(Action);
3966  RCG(CGF);
3967  }
3968  // There is no need to emit line number for unconditional branch.
3969  (void)ApplyDebugLocation::CreateEmpty(CGF);
3970  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3971 }
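The RedList built in the parallel-reduction branch above is just an array of void* slots, one per reduction variable, plus an extra slot holding the element count of each variably modified type. A minimal sketch of the layout the runtime receives (hypothetical variables, mirroring the 'foo'/'bar' example from the design notes):

#include <cstdio>

int main() {
  float foo_private = 1.0f;   // private copy of 'foo'
  double bar_private = 2.0;   // private copy of 'bar'
  // One opaque slot per reduction variable; the runtime never interprets
  // them itself, only shuffleReduceFn/interWarpCpyFn know the layout.
  void *RedList[2] = {&foo_private, &bar_private};
  std::printf("RedList[0]=%p RedList[1]=%p\n", RedList[0], RedList[1]);
  return 0;
}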
3972 
3973 const VarDecl *
3974 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
3975  const VarDecl *NativeParam) const {
3976  if (!NativeParam->getType()->isReferenceType())
3977  return NativeParam;
3978  QualType ArgType = NativeParam->getType();
3979  QualifierCollector QC;
3980  const Type *NonQualTy = QC.strip(ArgType);
3981  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3982  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3983  if (Attr->getCaptureKind() == OMPC_map) {
3984  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3985  LangAS::opencl_global);
3986  }
3987  }
3988  ArgType = CGM.getContext().getPointerType(PointeeTy);
3989  QC.addRestrict();
3990  enum { NVPTX_local_addr = 5 };
3991  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3992  ArgType = QC.apply(CGM.getContext(), ArgType);
3993  if (isa<ImplicitParamDecl>(NativeParam))
3994  return ImplicitParamDecl::Create(
3995  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3996  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3997  return ParmVarDecl::Create(
3998  CGM.getContext(),
3999  const_cast<DeclContext *>(NativeParam->getDeclContext()),
4000  NativeParam->getBeginLoc(), NativeParam->getLocation(),
4001  NativeParam->getIdentifier(), ArgType,
4002  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4003 }
4004 
4005 Address
4006 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4007  const VarDecl *NativeParam,
4008  const VarDecl *TargetParam) const {
4009  assert(NativeParam != TargetParam &&
4010  NativeParam->getType()->isReferenceType() &&
4011  "Native arg must not be the same as target arg.");
4012  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4013  QualType NativeParamType = NativeParam->getType();
4014  QualifierCollector QC;
4015  const Type *NonQualTy = QC.strip(NativeParamType);
4016  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4017  unsigned NativePointeeAddrSpace =
4018  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4019  QualType TargetTy = TargetParam->getType();
4020  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4021  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4022  // First cast to generic.
4023  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4024  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4025  /*AddrSpace=*/0));
4026  // Cast from generic to native address space.
4027  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4028  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4029  NativePointeeAddrSpace));
4030  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4031  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4032  NativeParamType);
4033  return NativeParamAddr;
4034 }
4035 
4036 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4037  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
4038  ArrayRef<llvm::Value *> Args) const {
4039  SmallVector<llvm::Value *, 4> TargetArgs;
4040  TargetArgs.reserve(Args.size());
4041  auto *FnType =
4042  cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
4043  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4044  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4045  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4046  break;
4047  }
4048  llvm::Type *TargetType = FnType->getParamType(I);
4049  llvm::Value *NativeArg = Args[I];
4050  if (!TargetType->isPointerTy()) {
4051  TargetArgs.emplace_back(NativeArg);
4052  continue;
4053  }
4054  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4055  NativeArg,
4056  NativeArg->getType()->getPointerElementType()->getPointerTo());
4057  TargetArgs.emplace_back(
4058  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4059  }
4060  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4061 }
4062 
4063 /// Emit a function which wraps the outlined parallel region
4064 /// and controls the arguments which are passed to this function.
4065 /// The wrapper ensures that the outlined function is called
4066 /// with the correct arguments when data is shared.
4067 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4068  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4069  ASTContext &Ctx = CGM.getContext();
4070  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4071 
4072  // Create a function that takes as argument the source thread.
4073  FunctionArgList WrapperArgs;
4074  QualType Int16QTy =
4075  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4076  QualType Int32QTy =
4077  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4078  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4079  /*Id=*/nullptr, Int16QTy,
4080  ImplicitParamDecl::Other);
4081  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4082  /*Id=*/nullptr, Int32QTy,
4083  ImplicitParamDecl::Other);
4084  WrapperArgs.emplace_back(&ParallelLevelArg);
4085  WrapperArgs.emplace_back(&WrapperArg);
4086 
4087  const CGFunctionInfo &CGFI =
4088  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4089 
4090  auto *Fn = llvm::Function::Create(
4091  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4092  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4093  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4094  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4095  Fn->setDoesNotRecurse();
4096 
4097  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4098  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4099  D.getBeginLoc(), D.getBeginLoc());
4100 
4101  const auto *RD = CS.getCapturedRecordDecl();
4102  auto CurField = RD->field_begin();
4103 
4104  Address ZeroAddr = CGF.CreateMemTemp(
4105  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4106  /*Name*/ ".zero.addr");
4107  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4108  // Get the array of arguments.
4109  SmallVector<llvm::Value *, 8> Args;
4110 
4111  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4112  Args.emplace_back(ZeroAddr.getPointer());
4113 
4114  CGBuilderTy &Bld = CGF.Builder;
4115  auto CI = CS.capture_begin();
4116 
4117  // Use global memory for data sharing.
4118  // Handle passing of global args to workers.
4119  Address GlobalArgs =
4120  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4121  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4122  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4123  CGF.EmitRuntimeCall(
4124  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4125  DataSharingArgs);
4126 
4127  // Retrieve the shared variables from the list of references returned
4128  // by the runtime. Pass the variables to the outlined function.
4129  Address SharedArgListAddress = Address::invalid();
4130  if (CS.capture_size() > 0 ||
4131  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4132  SharedArgListAddress = CGF.EmitLoadOfPointer(
4133  GlobalArgs, CGF.getContext()
4134  .getPointerType(CGF.getContext().getPointerType(
4135  CGF.getContext().VoidPtrTy))
4136  .castAs<PointerType>());
4137  }
4138  unsigned Idx = 0;
4139  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4140  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4141  CGF.getPointerSize());
4142  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4143  Src, CGF.SizeTy->getPointerTo());
4144  llvm::Value *LB = CGF.EmitLoadOfScalar(
4145  TypedAddress,
4146  /*Volatile=*/false,
4147  CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4148  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4149  Args.emplace_back(LB);
4150  ++Idx;
4151  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4152  CGF.getPointerSize());
4153  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4154  Src, CGF.SizeTy->getPointerTo());
4155  llvm::Value *UB = CGF.EmitLoadOfScalar(
4156  TypedAddress,
4157  /*Volatile=*/false,
4158  CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4159  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4160  Args.emplace_back(UB);
4161  ++Idx;
4162  }
4163  if (CS.capture_size() > 0) {
4164  ASTContext &CGFContext = CGF.getContext();
4165  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4166  QualType ElemTy = CurField->getType();
4167  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
4168  CGF.getPointerSize());
4169  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4170  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4171  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4172  /*Volatile=*/false,
4173  CGFContext.getPointerType(ElemTy),
4174  CI->getLocation());
4175  if (CI->capturesVariableByCopy() &&
4176  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4177  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4178  CI->getLocation());
4179  }
4180  Args.emplace_back(Arg);
4181  }
4182  }
4183 
4184  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4185  CGF.FinishFunction();
4186  return Fn;
4187 }
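In source form, the wrapper built here behaves roughly like the sketch below; it is runnable with a stub standing in for __kmpc_get_shared_variables, and every name in it is hypothetical:

#include <cstdio>

static void *SharedSlot;  // stands in for the team's data-sharing stack

// Stub for the runtime call: hands back the shared-argument list.
static void getSharedVariablesStub(void ***GlobalArgs) {
  static void *List[1];
  List[0] = SharedSlot;
  *GlobalArgs = List;
}

// The outlined parallel body: (gtid, btid, one pointer per shared capture).
static void outlinedParallelFn(int *GTid, int * /*BTid*/, int *SharedA) {
  std::printf("thread %d sees shared a = %d\n", *GTid, *SharedA);
}

// Shape of the generated wrapper: (parallel level, source thread id).
static void outlinedParallelFnWrapper(short /*ParallelLevel*/, int ThreadId) {
  int Zero = 0;
  void **SharedArgs;
  getSharedVariablesStub(&SharedArgs);
  outlinedParallelFn(&ThreadId, &Zero, static_cast<int *>(SharedArgs[0]));
}

int main() {
  int A = 42;
  SharedSlot = &A;
  outlinedParallelFnWrapper(/*ParallelLevel=*/0, /*ThreadId=*/3);
  return 0;
}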
4188 
4189 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4190  const Decl *D) {
4191  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4192  return;
4193 
4194  assert(D && "Expected function or captured|block decl.");
4195  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4196  "Function is registered already.");
4197  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4198  "Team is set but not processed.");
4199  const Stmt *Body = nullptr;
4200  bool NeedToDelayGlobalization = false;
4201  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4202  Body = FD->getBody();
4203  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4204  Body = BD->getBody();
4205  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4206  Body = CD->getBody();
4207  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4208  if (NeedToDelayGlobalization &&
4209  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4210  return;
4211  }
4212  if (!Body)
4213  return;
4214  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4215  VarChecker.Visit(Body);
4216  const RecordDecl *GlobalizedVarsRecord =
4217  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4218  TeamAndReductions.first = nullptr;
4219  TeamAndReductions.second.clear();
4220  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4221  VarChecker.getEscapedVariableLengthDecls();
4222  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4223  return;
4224  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4225  I->getSecond().MappedParams =
4226  llvm::make_unique<CodeGenFunction::OMPMapVars>();
4227  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4228  I->getSecond().EscapedParameters.insert(
4229  VarChecker.getEscapedParameters().begin(),
4230  VarChecker.getEscapedParameters().end());
4231  I->getSecond().EscapedVariableLengthDecls.append(
4232  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4233  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4234  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4235  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4236  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4237  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4238  }
4239  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4240  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4241  VarChecker.Visit(Body);
4242  I->getSecond().SecondaryGlobalRecord =
4243  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4244  I->getSecond().SecondaryLocalVarData.emplace();
4245  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4246  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4247  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4248  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4249  Data.insert(
4250  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4251  }
4252  }
4253  if (!NeedToDelayGlobalization) {
4254  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4255  struct GlobalizationScope final : EHScopeStack::Cleanup {
4256  GlobalizationScope() = default;
4257 
4258  void Emit(CodeGenFunction &CGF, Flags flags) override {
4259  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4260  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4261  }
4262  };
4263  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4264  }
4265 }
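The effect of this prolog is easiest to see as a source-to-source transform: a local that escapes into a parallel region stops living in a private stack slot and becomes a field of a globalized record. A sketch of the resulting layout (the record and field names are assumptions, not the generated IR):

#include <cstdio>

// One field per escaped declaration, mirroring the record built by the
// escape checker above.
struct _globalized_locals_ty { int x; };

int main() {
  _globalized_locals_ty Frame{41};  // slot obtained from the shared stack
  int *PX = &Frame.x;               // workers receive the field's address
  *PX += 1;
  std::printf("x = %d\n", Frame.x);
  return 0;
}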
4266 
4267 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4268  const VarDecl *VD) {
4269  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4270  return Address::invalid();
4271 
4272  VD = VD->getCanonicalDecl();
4273  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4274  if (I == FunctionGlobalizedDecls.end())
4275  return Address::invalid();
4276  auto VDI = I->getSecond().LocalVarData.find(VD);
4277  if (VDI != I->getSecond().LocalVarData.end())
4278  return VDI->second.PrivateAddr;
4279  if (VD->hasAttrs()) {
4280  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4281  E(VD->attr_end());
4282  IT != E; ++IT) {
4283  auto VDI = I->getSecond().LocalVarData.find(
4284  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4285  ->getCanonicalDecl());
4286  if (VDI != I->getSecond().LocalVarData.end())
4287  return VDI->second.PrivateAddr;
4288  }
4289  }
4290  return Address::invalid();
4291 }
4292 
4293 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4294  FunctionGlobalizedDecls.erase(CGF.CurFn);
4295  CGOpenMPRuntime::functionFinished(CGF);
4296 }
4297 
4298 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4299  CodeGenFunction &CGF, const OMPLoopDirective &S,
4300  OpenMPDistScheduleClauseKind &ScheduleKind,
4301  llvm::Value *&Chunk) const {
4302  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4303  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4304  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4305  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4306  S.getIterationVariable()->getType(), S.getBeginLoc());
4307  return;
4308  }
4309  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4310  CGF, S, ScheduleKind, Chunk);
4311 }
4312 
4313 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4314  CodeGenFunction &CGF, const OMPLoopDirective &S,
4315  OpenMPScheduleClauseKind &ScheduleKind,
4316  const Expr *&ChunkExpr) const {
4317  ScheduleKind = OMPC_SCHEDULE_static;
4318  // Chunk size is 1 in this case.
4319  llvm::APInt ChunkSize(32, 1);
4320  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4321  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4322  SourceLocation());
4323 }
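// [Editor's example] The worksharing default above is equivalent to writing
// schedule(static, 1) explicitly:
//
//   #pragma omp parallel for schedule(static, 1)
//   for (int i = 0; i < n; ++i) c[i] = a[i] + b[i];
//
// With a chunk of 1, consecutive iterations land on consecutive threads,
// which keeps accesses to consecutive array elements coalesced within a warp.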
4324 
4325 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4326  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4327  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4328  "Expected target-based directive.");
4329  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4330  for (const CapturedStmt::Capture &C : CS->captures()) {
4331  // Capture variables captured by reference in lambdas for target-based
4332  // directives.
4333  if (!C.capturesVariable())
4334  continue;
4335  const VarDecl *VD = C.getCapturedVar();
4336  const auto *RD = VD->getType()
4337  .getCanonicalType()
4338  .getNonReferenceType()
4339  ->getAsCXXRecordDecl();
4340  if (!RD || !RD->isLambda())
4341  continue;
4342  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4343  LValue VDLVal;
4344  if (VD->getType().getCanonicalType()->isReferenceType())
4345  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4346  else
4347  VDLVal = CGF.MakeAddrLValue(
4348  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4349  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4350  FieldDecl *ThisCapture = nullptr;
4351  RD->getCaptureFields(Captures, ThisCapture);
4352  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4353  LValue ThisLVal =
4354  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4355  llvm::Value *CXXThis = CGF.LoadCXXThis();
4356  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4357  }
4358  for (const LambdaCapture &LC : RD->captures()) {
4359  if (LC.getCaptureKind() != LCK_ByRef)
4360  continue;
4361  const VarDecl *VD = LC.getCapturedVar();
4362  if (!CS->capturesVariable(VD))
4363  continue;
4364  auto It = Captures.find(VD);
4365  assert(It != Captures.end() && "Found lambda capture without field.");
4366  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4367  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4368  if (VD->getType().getCanonicalType()->isReferenceType())
4369  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4370  VD->getType().getCanonicalType())
4371  .getAddress();
4372  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4373  }
4374  }
4375 }
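// [Editor's example] The rewriting above matters for code like the following,
// where a lambda captures a mapped variable by reference:
//
//   int x = 0;
//   auto f = [&x]() { x = 42; };
//   #pragma omp target map(tofrom : x)
//   f();
//
// Without the fix-up, the reference stored in f's closure would still point
// at the host copy of x; the loop above stores the device address of each
// by-reference capture (and of 'this') into the corresponding closure field.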
4376 
4377 // Get the current CudaArch and ignore any unknown values.
4378 static CudaArch getCudaArch(CodeGenModule &CGM) {
4379  if (!CGM.getTarget().hasFeature("ptx"))
4380  return CudaArch::UNKNOWN;
4381  llvm::StringMap<bool> Features;
4382  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4383  CGM.getTarget().getTargetOpts().CPU,
4384  CGM.getTarget().getTargetOpts().Features);
4385  for (const auto &Feature : Features) {
4386  if (Feature.getValue()) {
4387  CudaArch Arch = StringToCudaArch(Feature.getKey());
4388  if (Arch != CudaArch::UNKNOWN)
4389  return Arch;
4390  }
4391  }
4392  return CudaArch::UNKNOWN;
4393 }
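// [Editor's note] Sketch of the lookup above: for -march=sm_70 the feature
// map contains entries such as {"sm_70", true} and {"ptx63", true};
// StringToCudaArch returns CudaArch::UNKNOWN for the "ptx*" names, so the
// first recognized architecture feature (here SM_70) wins.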
4394 
4395 /// Check whether the target architecture supports unified addressing, which
4396 /// is a requirement of the OpenMP requires clause "unified_shared_memory".
4397 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4398  CodeGenModule &CGM, const OMPRequiresDecl *D) const {
4399  for (const OMPClause *Clause : D->clauselists()) {
4400  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4401  switch (getCudaArch(CGM)) {
4402  case CudaArch::SM_20:
4403  case CudaArch::SM_21:
4404  case CudaArch::SM_30:
4405  case CudaArch::SM_32:
4406  case CudaArch::SM_35:
4407  case CudaArch::SM_37:
4408  case CudaArch::SM_50:
4409  case CudaArch::SM_52:
4410  case CudaArch::SM_53:
4411  case CudaArch::SM_60:
4412  case CudaArch::SM_61:
4413  case CudaArch::SM_62:
4414  CGM.Error(Clause->getBeginLoc(),
4415  "Target architecture does not support unified addressing");
4416  return;
4417  case CudaArch::SM_70:
4418  case CudaArch::SM_72:
4419  case CudaArch::SM_75:
4420  case CudaArch::GFX600:
4421  case CudaArch::GFX601:
4422  case CudaArch::GFX700:
4423  case CudaArch::GFX701:
4424  case CudaArch::GFX702:
4425  case CudaArch::GFX703:
4426  case CudaArch::GFX704:
4427  case CudaArch::GFX801:
4428  case CudaArch::GFX802:
4429  case CudaArch::GFX803:
4430  case CudaArch::GFX810:
4431  case CudaArch::GFX900:
4432  case CudaArch::GFX902:
4433  case CudaArch::GFX904:
4434  case CudaArch::GFX906:
4435  case CudaArch::GFX909:
4436  case CudaArch::UNKNOWN:
4437  break;
4438  case CudaArch::LAST:
4439  llvm_unreachable("Unexpected Cuda arch.");
4440  }
4441  }
4442  }
4443 }
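// [Editor's example] The switch above makes
//
//   #pragma omp requires unified_shared_memory
//
// a hard error when compiling for sm_62 or below, while sm_70/72/75, the
// listed GFX architectures, and UNKNOWN are accepted without a diagnostic.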
4444 
4445 /// Get number of SMs and number of blocks per SM.
4446 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4447  std::pair<unsigned, unsigned> Data;
4448  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4449  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4450  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4451  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4452  if (Data.first && Data.second)
4453  return Data;
4454  switch (getCudaArch(CGM)) {
4455  case CudaArch::SM_20:
4456  case CudaArch::SM_21:
4457  case CudaArch::SM_30:
4458  case CudaArch::SM_32:
4459  case CudaArch::SM_35:
4460  case CudaArch::SM_37:
4461  case CudaArch::SM_50:
4462  case CudaArch::SM_52:
4463  case CudaArch::SM_53:
4464  return {16, 16};
4465  case CudaArch::SM_60:
4466  case CudaArch::SM_61:
4467  case CudaArch::SM_62:
4468  return {56, 32};
4469  case CudaArch::SM_70:
4470  case CudaArch::SM_72:
4471  case CudaArch::SM_75:
4472  return {84, 32};
4473  case CudaArch::GFX600:
4474  case CudaArch::GFX601:
4475  case CudaArch::GFX700:
4476  case CudaArch::GFX701:
4477  case CudaArch::GFX702:
4478  case CudaArch::GFX703:
4479  case CudaArch::GFX704:
4480  case CudaArch::GFX801:
4481  case CudaArch::GFX802:
4482  case CudaArch::GFX803:
4483  case CudaArch::GFX810:
4484  case CudaArch::GFX900:
4485  case CudaArch::GFX902:
4486  case CudaArch::GFX904:
4487  case CudaArch::GFX906:
4488  case CudaArch::GFX909:
4489  case CudaArch::UNKNOWN:
4490  break;
4491  case CudaArch::LAST:
4492  llvm_unreachable("Unexpected Cuda arch.");
4493  }
4494  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4495 }
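// [Editor's example] These pairs bound the statically allocated globalization
// storage: for sm_70 the table yields {84 SMs, 32 blocks per SM}, so the
// array built in clear() below has 84 * 32 = 2688 record slots, one per
// potentially resident thread block. The cc1 options
// -fopenmp-cuda-number-of-sm and -fopenmp-cuda-blocks-per-sm (assumed here
// from the LangOpts names) override the table.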
4496 
4497 void CGOpenMPRuntimeNVPTX::clear() {
4498  if (!GlobalizedRecords.empty()) {
4499  ASTContext &C = CGM.getContext();
4500  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4501  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
4502  RecordDecl *StaticRD = C.buildImplicitRecord(
4503  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4504  StaticRD->startDefinition();
4505  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4506  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4507  SharedStaticRD->startDefinition();
4508  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4509  if (Records.Records.empty())
4510  continue;
4511  unsigned Size = 0;
4512  unsigned RecAlignment = 0;
4513  for (const RecordDecl *RD : Records.Records) {
4514  QualType RDTy = C.getRecordType(RD);
4515  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4516  RecAlignment = std::max(RecAlignment, Alignment);
4517  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4518  Size =
4519  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4520  }
4521  Size = llvm::alignTo(Size, RecAlignment);
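// [Editor's note] Worked example of the packing above: two records with
// (size, align) = (12, 4) and (8, 8) give Size = alignTo(0 + 12, 4) = 12,
// then alignTo(alignTo(12, 8) + 8, 8) = 24; RecAlignment = 8, and the final
// alignTo(24, 8) leaves Size = 24.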
4522  llvm::APInt ArySize(/*numBits=*/64, Size);
4523  QualType SubTy = C.getConstantArrayType(
4524  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
4525  const bool UseSharedMemory = Size <= SharedMemorySize;
4526  auto *Field =
4527  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4528  SourceLocation(), SourceLocation(), nullptr, SubTy,
4529  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4530  /*BW=*/nullptr, /*Mutable=*/false,
4531  /*InitStyle=*/ICIS_NoInit);
4532  Field->setAccess(AS_public);
4533  if (UseSharedMemory) {
4534  SharedStaticRD->addDecl(Field);
4535  SharedRecs.push_back(&Records);
4536  } else {
4537  StaticRD->addDecl(Field);
4538  GlobalRecs.push_back(&Records);
4539  }
4540  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
4541  Records.UseSharedMemory->setInitializer(
4542  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
4543  }
4544  // Allocate SharedMemorySize buffer for the shared memory.
4545  // FIXME: nvlink does not handle weak linkage correctly (objects with
4546  // different sizes are reported as erroneous).
4547  // Restore this code as soon as nvlink is fixed.
4548  if (!SharedStaticRD->field_empty()) {
4549  llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
4550  QualType SubTy = C.getConstantArrayType(
4551  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
4552  auto *Field = FieldDecl::Create(
4553  C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
4554  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4555  /*BW=*/nullptr, /*Mutable=*/false,
4556  /*InitStyle=*/ICIS_NoInit);
4557  Field->setAccess(AS_public);
4558  SharedStaticRD->addDecl(Field);
4559  }
4560  SharedStaticRD->completeDefinition();
4561  if (!SharedStaticRD->field_empty()) {
4562  QualType StaticTy = C.getRecordType(SharedStaticRD);
4563  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
4564  auto *GV = new llvm::GlobalVariable(
4565  CGM.getModule(), LLVMStaticTy,
4566  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
4567  llvm::Constant::getNullValue(LLVMStaticTy),
4568  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
4569  llvm::GlobalValue::NotThreadLocal,
4570  C.getTargetAddressSpace(LangAS::cuda_shared));
4571  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4572  GV, CGM.VoidPtrTy);
4573  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
4574  Rec->Buffer->replaceAllUsesWith(Replacement);
4575  Rec->Buffer->eraseFromParent();
4576  }
4577  }
4578  StaticRD->completeDefinition();
4579  if (!StaticRD->field_empty()) {
4580  QualType StaticTy = C.getRecordType(StaticRD);
4581  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
4582  llvm::APInt Size1(32, SMsBlockPerSM.second);
4583  QualType Arr1Ty =
4584  C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
4585  /*IndexTypeQuals=*/0);
4586  llvm::APInt Size2(32, SMsBlockPerSM.first);
4587  QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
4588  /*IndexTypeQuals=*/0);
4589  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
4590  auto *GV = new llvm::GlobalVariable(
4591  CGM.getModule(), LLVMArr2Ty,
4592  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
4593  llvm::Constant::getNullValue(LLVMArr2Ty),
4594  "_openmp_static_glob_rd_$_");
4595  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4596  GV, CGM.VoidPtrTy);
4597  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
4598  Rec->Buffer->replaceAllUsesWith(Replacement);
4599  Rec->Buffer->eraseFromParent();
4600  }
4601  }
4602  }
4603  CGOpenMPRuntime::clear();
4604 }
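// [Editor's example] After clear() runs, the module contains, roughly:
//
//   // per-block storage in global memory, [NumSMs][BlocksPerSM] copies:
//   union _openmp_static_memory_type_$_ ... _openmp_static_glob_rd_$_;
//   // one SharedMemorySize-byte buffer in CUDA shared memory:
//   union _shared_openmp_static_memory_type_$_ ...
//       _openmp_shared_static_glob_rd_$_;
//
// and every placeholder Buffer global has been replaced (RAUW) with a pointer
// into one of these two objects.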