//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, void *global_buffer, int32_t num_of_records, void*
  /// reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
  /// *buffer, int idx, void *reduce_data));
  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
  /// const void *buf, size_t size, int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t
  /// isSPMDExecutionMode, int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
  OMPRTL__kmpc_barrier,
  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL__kmpc_barrier_simple_spmd,
  /// Call to int32_t __kmpc_warp_active_thread_mask(void);
  OMPRTL_NVPTX__kmpc_warp_active_thread_mask,
  /// Call to void __kmpc_syncwarp(int32_t Mask);
  OMPRTL_NVPTX__kmpc_syncwarp,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeNVPTX::EM_Unknown;
  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};
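
// A sketch of how the RAII object is used by the kernel emitters below: the
// constructor flips the mode for the duration of one target region and the
// destructor restores whatever was in effect before.
//   {
//     ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode,
//                                        RequiresFullRuntime,
//                                        /*FullRuntimeMode=*/true);
//     // ... emit the SPMD target region ...
//   } // previous execution/runtime modes restored here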

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};
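
// Worked example of the constants above, with WarpSize = 32: thread id 70
// decomposes into warp id 70 >> LaneIDBits = 2 and lane id
// 70 & LaneIDMask = 6 (see getNVPTXWarpID/getNVPTXLaneID below).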

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
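
// For example (illustrative reference expressions, not taken from this file):
//   x        -> the VarDecl for 'x'
//   a[i][j]  -> the VarDecl for 'a'       (subscripts stripped)
//   b[1:n]   -> the VarDecl for 'b'       (array sections stripped)
//   s.field  -> the FieldDecl for 'field' (via the MemberExpr path)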

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
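
// For illustration (a sketch of the resulting record, not emitted verbatim):
// globalizing an escaped 'int a' from a parallel region together with a
// teams-level 'double d', with BufSize == WarpSize, produces roughly
//   struct _globalized_locals_ty {
//     int a[32] __attribute__((aligned(128))); // one slot per lane
//     double d;                                // single slot for teams
//   };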

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(
                   static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
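
// Worked example of the masking above with WarpSize = 32 (Mask = 31):
//   NumThreads = 33   -> (33 - 1)   & ~31 = 32
//   NumThreads = 1024 -> (1024 - 1) & ~31 = 992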

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeNVPTX::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
                                          : CGOpenMPRuntimeNVPTX::Generic;
}

/// Check for inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
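
// For example, this region is detected as SPMD because the construct nested
// inside the target (reached through 'teams') is a parallel directive
// ('N' is illustrative):
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
//   for (int i = 0; i < N; ++i)
//     ...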

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Check if the directive is loops based and has schedule clause at all or has
/// static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
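
// For example, both of these loop directives qualify: the first has no
// schedule clause at all, the second explicitly requests static scheduling.
// An 'ordered' clause or a non-static schedule would disqualify them.
//   #pragma omp parallel for
//   #pragma omp parallel for schedule(static)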

/// Check for inner (nested) lightweight runtime construct, if any.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
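
// The emitted entry header has roughly this control-flow shape (a sketch):
//   if (nvptx_tid < thread_limit) goto .worker;     // all but the last warp
//   if (nvptx_tid == master_tid)  goto .master;     // first lane of last warp
//   goto .exit;                                     // idle lanes of last warp
//   .master: __kmpc_kernel_init(...); ...           // sequential region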

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
                         /*RequiresDataSharing=*/Bld.getInt16(0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}
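
// For a kernel entry named '__omp_offloading_..._foo_l1' (illustrative name),
// the IR this produces looks roughly like:
//   @__omp_offloading_..._foo_l1_exec_mode = weak constant i8 0
// where i8 0 marks 'spmd' mode and i8 1 marks 'generic' mode.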

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

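  // As a sketch, the state machine emitted below behaves like:
  //   for (;;) {
  //     syncCTAThreads();                            // wait for work
  //     active = __kmpc_kernel_parallel(&work_fn, /*RequiresOMPRuntime=*/1);
  //     if (!work_fn) break;                         // termination signal
  //     if (active) {                                // within requested count
  //       work_fn(/*ParallelLevel=*/0, thread_id);
  //       __kmpc_kernel_end_parallel();
  //     }
  //     syncCTAThreads();                            // barrier after region
  //   }
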
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // Skip initialization.
  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreateBitOrPointerCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false);
  llvm::Value *WorkFnCast =
      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID;
  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
  // Skip initialization.
  clearLocThreadIdInsertPt(CGF);
}
1531 
1532 /// Returns specified OpenMP runtime function for the current OpenMP
1533 /// implementation. Specialized for the NVPTX device.
1534 /// \param Function OpenMP runtime function.
1535 /// \return Specified function.
1536 llvm::FunctionCallee
1538  llvm::FunctionCallee RTLFn = nullptr;
1539  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1540  case OMPRTL_NVPTX__kmpc_kernel_init: {
1541  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1542  // RequiresOMPRuntime);
1543  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1544  auto *FnTy =
1545  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1546  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1547  break;
1548  }
1549  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1550  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1551  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1552  auto *FnTy =
1553  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1554  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1555  break;
1556  }
1557  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1558  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1559  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1560  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1561  auto *FnTy =
1562  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1563  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1564  break;
1565  }
1566  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1567  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1568  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1569  auto *FnTy =
1570  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1571  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1572  break;
1573  }
1574  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1575  /// Build void __kmpc_kernel_prepare_parallel(
1576  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1577  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1578  auto *FnTy =
1579  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1580  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1581  break;
1582  }
1583  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1584  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1585  /// int16_t IsOMPRuntimeInitialized);
1586  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1587  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1588  auto *FnTy =
1589  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1590  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1591  break;
1592  }
1593  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1594  /// Build void __kmpc_kernel_end_parallel();
1595  auto *FnTy =
1596  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1597  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1598  break;
1599  }
1600  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1601  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1602  // global_tid);
1603  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1604  auto *FnTy =
1605  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1606  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1607  break;
1608  }
1609  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1610  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1611  // global_tid);
1612  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1613  auto *FnTy =
1614  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1615  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1616  break;
1617  }
1618  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1619  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1620  // int16_t lane_offset, int16_t warp_size);
1621  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1622  auto *FnTy =
1623  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1624  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1625  break;
1626  }
1627  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1628  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1629  // int16_t lane_offset, int16_t warp_size);
1630  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1631  auto *FnTy =
1632  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1633  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1634  break;
1635  }
1636  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1637  // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1638  // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1639  // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1640  // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1641  // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
1642  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1643  CGM.Int16Ty, CGM.Int16Ty};
1644  auto *ShuffleReduceFnTy =
1645  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1646  /*isVarArg=*/false);
1647  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1648  auto *InterWarpCopyFnTy =
1649  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1650  /*isVarArg=*/false);
1651  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1652  CGM.Int32Ty,
1653  CGM.Int32Ty,
1654  CGM.SizeTy,
1655  CGM.VoidPtrTy,
1656  ShuffleReduceFnTy->getPointerTo(),
1657  InterWarpCopyFnTy->getPointerTo()};
1658  auto *FnTy =
1659  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1660  RTLFn = CGM.CreateRuntimeFunction(
1661  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1662  break;
1663  }
1664  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1665  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1666  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1667  auto *FnTy =
1668  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1669  RTLFn = CGM.CreateRuntimeFunction(
1670  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1671  break;
1672  }
1673  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1674  // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1675  // global_tid, void *global_buffer, int32_t num_of_records, void*
1676  // reduce_data,
1677  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1678  // lane_offset, int16_t shortCircuit),
1679  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1680  // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1681  // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1682  // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1683  // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1684  // *buffer, int idx, void *reduce_data));
1685  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1686  CGM.Int16Ty, CGM.Int16Ty};
1687  auto *ShuffleReduceFnTy =
1688  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1689  /*isVarArg=*/false);
1690  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1691  auto *InterWarpCopyFnTy =
1692  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1693  /*isVarArg=*/false);
1694  llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1695  CGM.VoidPtrTy};
1696  auto *GlobalListFnTy =
1697  llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1698  /*isVarArg=*/false);
1699  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1700  CGM.Int32Ty,
1701  CGM.VoidPtrTy,
1702  CGM.Int32Ty,
1703  CGM.VoidPtrTy,
1704  ShuffleReduceFnTy->getPointerTo(),
1705  InterWarpCopyFnTy->getPointerTo(),
1706  GlobalListFnTy->getPointerTo(),
1707  GlobalListFnTy->getPointerTo(),
1708  GlobalListFnTy->getPointerTo(),
1709  GlobalListFnTy->getPointerTo()};
1710  auto *FnTy =
1711  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1712  RTLFn = CGM.CreateRuntimeFunction(
1713  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1714  break;
1715  }
1716  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1717  /// Build void __kmpc_data_sharing_init_stack();
1718  auto *FnTy =
1719  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1720  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1721  break;
1722  }
1723  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1724  /// Build void __kmpc_data_sharing_init_stack_spmd();
1725  auto *FnTy =
1726  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1727  RTLFn =
1728  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1729  break;
1730  }
1731  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1732  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1733  // int16_t UseSharedMemory);
1734  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1735  auto *FnTy =
1736  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1737  RTLFn = CGM.CreateRuntimeFunction(
1738  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1739  break;
1740  }
1741  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1742  // Build void __kmpc_data_sharing_pop_stack(void *a);
1743  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1744  auto *FnTy =
1745  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1746  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1747  /*Name=*/"__kmpc_data_sharing_pop_stack");
1748  break;
1749  }
1750  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1751  /// Build void __kmpc_begin_sharing_variables(void ***args,
1752  /// size_t n_args);
1753  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1754  auto *FnTy =
1755  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1756  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1757  break;
1758  }
1759  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1760  /// Build void __kmpc_end_sharing_variables();
1761  auto *FnTy =
1762  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1763  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1764  break;
1765  }
1766  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1767  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1768  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1769  auto *FnTy =
1770  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1771  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1772  break;
1773  }
1774  case OMPRTL_NVPTX__kmpc_parallel_level: {
1775  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1776  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1777  auto *FnTy =
1778  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1779  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1780  break;
1781  }
1782  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1783  // Build int8_t __kmpc_is_spmd_exec_mode();
1784  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1785  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1786  break;
1787  }
1788  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1789  // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1790  // const void *buf, size_t size, int16_t is_shared, const void **res);
1791  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1792  CGM.Int16Ty, CGM.VoidPtrPtrTy};
1793  auto *FnTy =
1794  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1795  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1796  break;
1797  }
1798  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1799  // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1800  // int16_t is_shared);
1801  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1802  auto *FnTy =
1803  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1804  RTLFn =
1805  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1806  break;
1807  }
1808  case OMPRTL__kmpc_barrier: {
1809  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1810  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1811  auto *FnTy =
1812  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1813  RTLFn =
1814  CGM.CreateConvergentRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1815  break;
1816  }
1817  case OMPRTL__kmpc_barrier_simple_spmd: {
1818  // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1819  // global_tid);
1820  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1821  auto *FnTy =
1822  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1823  RTLFn = CGM.CreateConvergentRuntimeFunction(
1824  FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1825  break;
1826  }
1827  case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: {
1828  // Build int32_t __kmpc_warp_active_thread_mask(void);
1829  auto *FnTy =
1830  llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false);
1831  RTLFn = CGM.CreateConvergentRuntimeFunction(FnTy, "__kmpc_warp_active_thread_mask");
1832  break;
1833  }
1834  case OMPRTL_NVPTX__kmpc_syncwarp: {
1835  // Build void __kmpc_syncwarp(kmp_int32 Mask);
1836  auto *FnTy =
1837  llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false);
1838  RTLFn = CGM.CreateConvergentRuntimeFunction(FnTy, "__kmpc_syncwarp");
1839  break;
1840  }
1841  }
1842  return RTLFn;
1843 }
1844 
1845 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1846  llvm::Constant *Addr,
1847  uint64_t Size, int32_t,
1848  llvm::GlobalValue::LinkageTypes) {
1849  // TODO: Add support for global variables on the device after declare target
1850  // support.
1851  if (!isa<llvm::Function>(Addr))
1852  return;
1853  llvm::Module &M = CGM.getModule();
1854  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1855 
1856  // Get "nvvm.annotations" metadata node
1857  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1858 
1859  llvm::Metadata *MDVals[] = {
1860  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1861  llvm::ConstantAsMetadata::get(
1862  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1863  // Append metadata to nvvm.annotations
1864  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1865 }
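// Illustratively (an assumption about the rendered IR, with @foo standing in
// for an arbitrary kernel function), the metadata appended above prints as:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @foo, !"kernel", i32 1}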
1866 
1867 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1868  const OMPExecutableDirective &D, StringRef ParentName,
1869  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1870  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1871  if (!IsOffloadEntry) // Nothing to do.
1872  return;
1873 
1874  assert(!ParentName.empty() && "Invalid target region parent name!");
1875 
1876  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1877  if (Mode)
1878  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1879  CodeGen);
1880  else
1881  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1882  CodeGen);
1883 
1884  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1885 }
1886 
1887 namespace {
1888 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1889 /// Enum for accessing the reserved_2 field of the ident_t struct.
1890 enum ModeFlagsTy : unsigned {
1891  /// Bit set to 1 when in SPMD mode.
1892  KMP_IDENT_SPMD_MODE = 0x01,
1893  /// Bit set to 1 when a simplified runtime is used.
1894  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1895  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1896 };
1897 
1898 /// Special Undefined mode: the combination of Non-SPMD mode and the simple runtime.
1899 static const ModeFlagsTy UndefinedMode =
1900  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1901 } // anonymous namespace
1902 
1903 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1904  switch (getExecutionMode()) {
1905  case EM_SPMD:
1906  if (requiresFullRuntime())
1907  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1908  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1909  case EM_NonSPMD:
1910  assert(requiresFullRuntime() && "Expected full runtime.");
1911  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1912  case EM_Unknown:
1913  return UndefinedMode;
1914  }
1915  llvm_unreachable("Unknown flags are requested.");
1916 }
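// Summarizing the switch above, the reserved_2 encodings work out to:
//   EM_SPMD, full runtime    -> KMP_IDENT_SPMD_MODE
//   EM_SPMD, simple runtime  -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE
//   EM_NonSPMD               -> 0 (full runtime is required)
//   EM_Unknown               -> KMP_IDENT_SIMPLE_RT_MODE (UndefinedMode)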
1917 
1918 bool CGOpenMPRuntimeNVPTX::tryEmitDeclareVariantFunction(const GlobalDecl &NewGD,
1919  const GlobalDecl &OldGD,
1920  llvm::GlobalValue *OrigAddr,
1921  bool IsForDefinition) {
1922  // Emit the function in OldGD with the body from NewGD, if NewGD is defined.
1923  auto *NewFD = cast<FunctionDecl>(NewGD.getDecl());
1924  if (NewFD->isDefined()) {
1925  CGM.emitOpenMPDeviceFunctionRedefinition(OldGD, NewGD, OrigAddr);
1926  return true;
1927  }
1928  return false;
1929 }
1930 
1931 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1932  : CGOpenMPRuntime(CGM, "_", "$") {
1933  if (!CGM.getLangOpts().OpenMPIsDevice)
1934  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1935 }
1936 
1937 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1938  ProcBindKind ProcBind,
1939  SourceLocation Loc) {
1940  // Do nothing in case of SPMD mode and L0 parallel.
1941  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1942  return;
1943 
1944  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1945 }
1946 
1947 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1948  llvm::Value *NumThreads,
1949  SourceLocation Loc) {
1950  // Do nothing in case of SPMD mode and L0 parallel.
1951  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1952  return;
1953 
1954  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1955 }
1956 
1957 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1958  const Expr *NumTeams,
1959  const Expr *ThreadLimit,
1960  SourceLocation Loc) {}
1961 
1962 llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1963  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1964  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1965  // Emit target region as a standalone region.
1966  class NVPTXPrePostActionTy : public PrePostActionTy {
1967  bool &IsInParallelRegion;
1968  bool PrevIsInParallelRegion;
1969 
1970  public:
1971  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1972  : IsInParallelRegion(IsInParallelRegion) {}
1973  void Enter(CodeGenFunction &CGF) override {
1974  PrevIsInParallelRegion = IsInParallelRegion;
1975  IsInParallelRegion = true;
1976  }
1977  void Exit(CodeGenFunction &CGF) override {
1978  IsInParallelRegion = PrevIsInParallelRegion;
1979  }
1980  } Action(IsInParallelRegion);
1981  CodeGen.setAction(Action);
1982  bool PrevIsInTTDRegion = IsInTTDRegion;
1983  IsInTTDRegion = false;
1984  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1985  IsInTargetMasterThreadRegion = false;
1986  auto *OutlinedFun =
1987  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1988  D, ThreadIDVar, InnermostKind, CodeGen));
1989  if (CGM.getLangOpts().Optimize) {
1990  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1991  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1992  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1993  }
1994  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1995  IsInTTDRegion = PrevIsInTTDRegion;
1996  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1997  !IsInParallelRegion) {
1998  llvm::Function *WrapperFun =
1999  createParallelDataSharingWrapper(OutlinedFun, D);
2000  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
2001  }
2002 
2003  return OutlinedFun;
2004 }
2005 
2006 /// Get list of lastprivate variables from the teams distribute ... or
2007 /// teams {distribute ...} directives.
2008 static void
2009 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2010  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2011  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2012  "expected teams directive.");
2013  const OMPExecutableDirective *Dir = &D;
2014  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
2015  if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
2016  Ctx,
2017  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
2018  /*IgnoreCaptured=*/true))) {
2019  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
2020  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
2021  Dir = nullptr;
2022  }
2023  }
2024  if (!Dir)
2025  return;
2026  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
2027  for (const Expr *E : C->getVarRefs())
2028  Vars.push_back(getPrivateItem(E));
2029  }
2030 }
2031 
2032 /// Get list of reduction variables from the teams ... directives.
2033 static void
2034 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2035  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2036  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2037  "expected teams directive.");
2038  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2039  for (const Expr *E : C->privates())
2040  Vars.push_back(getPrivateItem(E));
2041  }
2042 }
2043 
2044 llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2045  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2046  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2047  SourceLocation Loc = D.getBeginLoc();
2048 
2049  const RecordDecl *GlobalizedRD = nullptr;
2050  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
2051  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2052  // Globalize team reductions variable unconditionally in all modes.
2053  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2054  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2055  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2056  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2057  if (!LastPrivatesReductions.empty()) {
2058  GlobalizedRD = ::buildRecordForGlobalizedVars(
2059  CGM.getContext(), llvm::None, LastPrivatesReductions,
2060  MappedDeclsFields, WarpSize);
2061  }
2062  } else if (!LastPrivatesReductions.empty()) {
2063  assert(!TeamAndReductions.first &&
2064  "Previous team declaration is not expected.");
2065  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2066  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2067  }
2068 
2069  // Emit target region as a standalone region.
2070  class NVPTXPrePostActionTy : public PrePostActionTy {
2071  SourceLocation &Loc;
2072  const RecordDecl *GlobalizedRD;
2073  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2074  &MappedDeclsFields;
2075 
2076  public:
2077  NVPTXPrePostActionTy(
2078  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2079  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2080  &MappedDeclsFields)
2081  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2082  MappedDeclsFields(MappedDeclsFields) {}
2083  void Enter(CodeGenFunction &CGF) override {
2084  auto &Rt =
2085  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2086  if (GlobalizedRD) {
2087  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2088  I->getSecond().GlobalRecord = GlobalizedRD;
2089  I->getSecond().MappedParams =
2090  std::make_unique<CodeGenFunction::OMPMapVars>();
2091  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2092  for (const auto &Pair : MappedDeclsFields) {
2093  assert(Pair.getFirst()->isCanonicalDecl() &&
2094  "Expected canonical declaration");
2095  Data.insert(std::make_pair(Pair.getFirst(),
2096  MappedVarData(Pair.getSecond(),
2097  /*IsOnePerTeam=*/true)));
2098  }
2099  }
2100  Rt.emitGenericVarsProlog(CGF, Loc);
2101  }
2102  void Exit(CodeGenFunction &CGF) override {
2103  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2104  .emitGenericVarsEpilog(CGF);
2105  }
2106  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2107  CodeGen.setAction(Action);
2108  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2109  D, ThreadIDVar, InnermostKind, CodeGen);
2110  if (CGM.getLangOpts().Optimize) {
2111  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2112  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2113  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2114  }
2115 
2116  return OutlinedFun;
2117 }
2118 
2119 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2120  SourceLocation Loc,
2121  bool WithSPMDCheck) {
2122  if (getDataSharingMode(CGM) == CGOpenMPRuntimeNVPTX::CUDA &&
2123  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2124  return;
2125 
2126  CGBuilderTy &Bld = CGF.Builder;
2127 
2128  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2129  if (I == FunctionGlobalizedDecls.end())
2130  return;
2131  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2132  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2133  QualType SecGlobalRecTy;
2134 
2135  // Recover pointer to this function's global record. The runtime will
2136  // handle the specifics of the allocation of the memory.
2137  // Use actual memory size of the record including the padding
2138  // for alignment purposes.
2139  unsigned Alignment =
2140  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2141  unsigned GlobalRecordSize =
2142  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2143  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2144 
2145  llvm::PointerType *GlobalRecPtrTy =
2146  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2147  llvm::Value *GlobalRecCastAddr;
2148  llvm::Value *IsTTD = nullptr;
2149  if (!IsInTTDRegion &&
2150  (WithSPMDCheck ||
2151  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2152  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2153  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2154  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2155  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2156  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2157  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2158  llvm::Value *PL = CGF.EmitRuntimeCall(
2159  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2160  {RTLoc, ThreadID});
2161  IsTTD = Bld.CreateIsNull(PL);
2162  }
2163  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2164  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2165  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2166  // There is no need to emit line number for unconditional branch.
2167  (void)ApplyDebugLocation::CreateEmpty(CGF);
2168  CGF.EmitBlock(SPMDBB);
2169  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2170  CharUnits::fromQuantity(Alignment));
2171  CGF.EmitBranch(ExitBB);
2172  // There is no need to emit line number for unconditional branch.
2173  (void)ApplyDebugLocation::CreateEmpty(CGF);
2174  CGF.EmitBlock(NonSPMDBB);
2175  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2176  if (const RecordDecl *SecGlobalizedVarsRecord =
2177  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2178  SecGlobalRecTy =
2179  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2180 
2181  // Recover pointer to this function's global record. The runtime will
2182  // handle the specifics of the allocation of the memory.
2183  // Use actual memory size of the record including the padding
2184  // for alignment purposes.
2185  unsigned Alignment =
2186  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2187  unsigned GlobalRecordSize =
2188  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2189  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2190  Size = Bld.CreateSelect(
2191  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2192  }
2193  // TODO: allow the use of shared memory to be controlled by
2194  // the user; for now, default to global.
2195  llvm::Value *GlobalRecordSizeArg[] = {
2196  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2197  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2198  createNVPTXRuntimeFunction(
2199  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2200  GlobalRecordSizeArg);
2201  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2202  GlobalRecValue, GlobalRecPtrTy);
2203  CGF.EmitBlock(ExitBB);
2204  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2205  /*NumReservedValues=*/2, "_select_stack");
2206  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2207  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2208  GlobalRecCastAddr = Phi;
2209  I->getSecond().GlobalRecordAddr = Phi;
2210  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2211  } else if (IsInTTDRegion) {
2212  assert(GlobalizedRecords.back().Records.size() < 2 &&
2213  "Expected less than 2 globalized records: one for target and one "
2214  "for teams.");
2215  unsigned Offset = 0;
2216  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2217  QualType RDTy = CGM.getContext().getRecordType(RD);
2218  unsigned Alignment =
2219  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2220  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2221  Offset =
2222  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2223  }
2224  unsigned Alignment =
2225  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2226  Offset = llvm::alignTo(Offset, Alignment);
2227  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2228  ++GlobalizedRecords.back().RegionCounter;
2229  if (GlobalizedRecords.back().Records.size() == 1) {
2230  assert(KernelStaticGlobalized &&
2231  "Kernel static pointer must be initialized already.");
2232  auto *UseSharedMemory = new llvm::GlobalVariable(
2233  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2234  llvm::GlobalValue::InternalLinkage, nullptr,
2235  "_openmp_static_kernel$is_shared");
2236  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2237  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2238  /*DestWidth=*/16, /*Signed=*/0);
2239  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2240  Address(UseSharedMemory,
2241  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2242  /*Volatile=*/false, Int16Ty, Loc);
2243  auto *StaticGlobalized = new llvm::GlobalVariable(
2244  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2245  llvm::GlobalValue::CommonLinkage, nullptr);
2246  auto *RecSize = new llvm::GlobalVariable(
2247  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2248  llvm::GlobalValue::InternalLinkage, nullptr,
2249  "_openmp_static_kernel$size");
2250  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2251  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2252  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2253  CGM.getContext().getSizeType(), Loc);
2254  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2255  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2256  llvm::Value *GlobalRecordSizeArg[] = {
2257  llvm::ConstantInt::get(
2258  CGM.Int16Ty,
2259  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2260  StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2261  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2262  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2263  GlobalRecordSizeArg);
2264  GlobalizedRecords.back().Buffer = StaticGlobalized;
2265  GlobalizedRecords.back().RecSize = RecSize;
2266  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2267  GlobalizedRecords.back().Loc = Loc;
2268  }
2269  assert(KernelStaticGlobalized && "Global address must be set already.");
2270  Address FrameAddr = CGF.EmitLoadOfPointer(
2271  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2272  CGM.getContext()
2273  .getPointerType(CGM.getContext().VoidPtrTy)
2274  .castAs<PointerType>());
2275  llvm::Value *GlobalRecValue =
2276  Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2277  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2278  I->getSecond().IsInSPMDModeFlag = nullptr;
2279  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2280  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2281  } else {
2282  // TODO: allow the use of shared memory to be controlled by
2283  // the user; for now, default to global.
2284  llvm::Value *GlobalRecordSizeArg[] = {
2285  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2286  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2287  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2288  createNVPTXRuntimeFunction(
2289  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2290  GlobalRecordSizeArg);
2291  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2292  GlobalRecValue, GlobalRecPtrTy);
2293  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2294  I->getSecond().IsInSPMDModeFlag = nullptr;
2295  }
2296  LValue Base =
2297  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2298 
2299  // Emit the "global alloca" which is a GEP from the global declaration
2300  // record using the pointer returned by the runtime.
2301  LValue SecBase;
2302  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2303  if (IsTTD) {
2304  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2305  llvm::PointerType *SecGlobalRecPtrTy =
2306  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2307  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2308  Bld.CreatePointerBitCastOrAddrSpaceCast(
2309  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2310  SecGlobalRecTy);
2311  }
2312  for (auto &Rec : I->getSecond().LocalVarData) {
2313  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2314  llvm::Value *ParValue;
2315  if (EscapedParam) {
2316  const auto *VD = cast<VarDecl>(Rec.first);
2317  LValue ParLVal =
2318  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2319  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2320  }
2321  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2322  // Emit VarAddr based on lane-id if required.
2323  QualType VarTy;
2324  if (Rec.second.IsOnePerTeam) {
2325  VarTy = Rec.second.FD->getType();
2326  } else {
2327  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2328  VarAddr.getAddress(CGF).getPointer(),
2329  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2330  VarTy =
2331  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2332  VarAddr = CGF.MakeAddrLValue(
2333  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2334  AlignmentSource::Decl);
2335  }
2336  Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
2337  if (!IsInTTDRegion &&
2338  (WithSPMDCheck ||
2339  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2340  assert(I->getSecond().IsInSPMDModeFlag &&
2341  "Expected unknown execution mode or required SPMD check.");
2342  if (IsTTD) {
2343  assert(SecIt->second.IsOnePerTeam &&
2344  "Secondary glob data must be one per team.");
2345  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2346  VarAddr.setAddress(
2347  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
2348  VarAddr.getPointer(CGF)),
2349  VarAddr.getAlignment()));
2350  Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
2351  }
2352  Address GlobalPtr = Rec.second.PrivateAddr;
2353  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2354  Rec.second.PrivateAddr = Address(
2355  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2356  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2357  LocalAddr.getAlignment());
2358  }
2359  if (EscapedParam) {
2360  const auto *VD = cast<VarDecl>(Rec.first);
2361  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2362  I->getSecond().MappedParams->setVarAddr(CGF, VD,
2363  VarAddr.getAddress(CGF));
2364  }
2365  if (IsTTD)
2366  ++SecIt;
2367  }
2368  }
2369  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2370  // Recover pointer to this function's global record. The runtime will
2371  // handle the specifics of the allocation of the memory.
2372  // Use actual memory size of the record including the padding
2373  // for alignment purposes.
2374  CGBuilderTy &Bld = CGF.Builder;
2375  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2376  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2377  Size = Bld.CreateNUWAdd(
2378  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2379  llvm::Value *AlignVal =
2380  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2381  Size = Bld.CreateUDiv(Size, AlignVal);
2382  Size = Bld.CreateNUWMul(Size, AlignVal);
2383  // TODO: allow the use of shared memory to be controlled by
2384  // the user; for now, default to global.
2385  llvm::Value *GlobalRecordSizeArg[] = {
2386  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2387  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2388  createNVPTXRuntimeFunction(
2389  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2390  GlobalRecordSizeArg);
2391  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2392  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2393  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2394  CGM.getContext().getDeclAlign(VD),
2395  AlignmentSource::Decl);
2396  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2397  Base.getAddress(CGF));
2398  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2399  }
2400  I->getSecond().MappedParams->apply(CGF);
2401 }
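// On the plain globalization path (no TTD region, no SPMD check), the prolog
// above and the matching epilog below reduce to roughly this runtime-call
// pairing (a simplified sketch, not verbatim):
//
//   void *rec = __kmpc_data_sharing_coalesced_push_stack(size, /*shared=*/0);
//   ... escaped locals live in fields of *rec ...
//   __kmpc_data_sharing_pop_stack(rec);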
2402 
2403 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2404  bool WithSPMDCheck) {
2405  if (getDataSharingMode(CGM) == CGOpenMPRuntimeNVPTX::CUDA &&
2406  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2407  return;
2408 
2409  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2410  if (I != FunctionGlobalizedDecls.end()) {
2411  I->getSecond().MappedParams->restore(CGF);
2412  if (!CGF.HaveInsertPoint())
2413  return;
2414  for (llvm::Value *Addr :
2415  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2416  CGF.EmitRuntimeCall(
2417  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2418  Addr);
2419  }
2420  if (I->getSecond().GlobalRecordAddr) {
2421  if (!IsInTTDRegion &&
2422  (WithSPMDCheck ||
2423  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2424  CGBuilderTy &Bld = CGF.Builder;
2425  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2426  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2427  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2428  // There is no need to emit line number for unconditional branch.
2429  (void)ApplyDebugLocation::CreateEmpty(CGF);
2430  CGF.EmitBlock(NonSPMDBB);
2431  CGF.EmitRuntimeCall(
2432  createNVPTXRuntimeFunction(
2433  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2434  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2435  CGF.EmitBlock(ExitBB);
2436  } else if (IsInTTDRegion) {
2437  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2438  "region counter must be > 0.");
2439  --GlobalizedRecords.back().RegionCounter;
2440  // Emit the restore function only in the target region.
2441  if (GlobalizedRecords.back().RegionCounter == 0) {
2442  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2443  /*DestWidth=*/16, /*Signed=*/0);
2444  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2445  Address(GlobalizedRecords.back().UseSharedMemory,
2446  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2447  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2448  llvm::Value *Args[] = {
2449  llvm::ConstantInt::get(
2450  CGM.Int16Ty,
2451  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2452  IsInSharedMemory};
2453  CGF.EmitRuntimeCall(
2454  createNVPTXRuntimeFunction(
2455  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2456  Args);
2457  }
2458  } else {
2459  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2460  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2461  I->getSecond().GlobalRecordAddr);
2462  }
2463  }
2464  }
2465 }
2466 
2467 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2468  const OMPExecutableDirective &D,
2469  SourceLocation Loc,
2470  llvm::Function *OutlinedFn,
2471  ArrayRef<llvm::Value *> CapturedVars) {
2472  if (!CGF.HaveInsertPoint())
2473  return;
2474 
2475  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2476  /*Name=*/".zero.addr");
2477  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2478  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2479  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2480  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2481  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2482  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2483 }
2484 
2485 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2486  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2487  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2488  if (!CGF.HaveInsertPoint())
2489  return;
2490 
2491  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2492  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2493  else
2494  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2495 }
2496 
2497 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2498  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2499  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2500  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2501 
2502  // Force inline this outlined function at its call site.
2503  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2504 
2505  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2506  /*Name=*/".zero.addr");
2507  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2508  // ThreadId for serialized parallels is 0.
2509  Address ThreadIDAddr = ZeroAddr;
2510  auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr](
2511  CodeGenFunction &CGF, PrePostActionTy &Action) {
2512  Action.Enter(CGF);
2513 
2514  Address ZeroAddr =
2515  CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2516  /*Name=*/".bound.zero.addr");
2517  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2518  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2519  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2520  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2521  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2522  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2523  };
2524  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2525  PrePostActionTy &) {
2526 
2527  RegionCodeGenTy RCG(CodeGen);
2528  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2529  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2530  llvm::Value *Args[] = {RTLoc, ThreadID};
2531 
2532  NVPTXActionTy Action(
2533  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2534  Args,
2535  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2536  Args);
2537  RCG.setAction(Action);
2538  RCG(CGF);
2539  };
2540 
2541  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2542  PrePostActionTy &Action) {
2543  CGBuilderTy &Bld = CGF.Builder;
2544  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2545  assert(WFn && "Wrapper function does not exist!");
2546  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2547 
2548  // Prepare for parallel region. Indicate the outlined function.
2549  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2550  CGF.EmitRuntimeCall(
2551  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2552  Args);
2553 
2554  // Create a private scope that will globalize the arguments
2555  // passed from the outside of the target region.
2556  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2557 
2558  // There's something to share.
2559  if (!CapturedVars.empty()) {
2560  // Prepare for parallel region. Indicate the outlined function.
2561  Address SharedArgs =
2562  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2563  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2564 
2565  llvm::Value *DataSharingArgs[] = {
2566  SharedArgsPtr,
2567  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2568  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2569  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2570  DataSharingArgs);
2571 
2572  // Store variable address in a list of references to pass to workers.
2573  unsigned Idx = 0;
2574  ASTContext &Ctx = CGF.getContext();
2575  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2576  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2577  .castAs<PointerType>());
2578  for (llvm::Value *V : CapturedVars) {
2579  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2580  llvm::Value *PtrV;
2581  if (V->getType()->isIntegerTy())
2582  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2583  else
2584  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2585  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2586  Ctx.getPointerType(Ctx.VoidPtrTy));
2587  ++Idx;
2588  }
2589  }
2590 
2591  // Activate workers. This barrier is used by the master to signal
2592  // work for the workers.
2593  syncCTAThreads(CGF);
2594 
2595  // OpenMP [2.5, Parallel Construct, p.49]
2596  // There is an implied barrier at the end of a parallel region. After the
2597  // end of a parallel region, only the master thread of the team resumes
2598  // execution of the enclosing task region.
2599  //
2600  // The master waits at this barrier until all workers are done.
2601  syncCTAThreads(CGF);
2602 
2603  if (!CapturedVars.empty())
2604  CGF.EmitRuntimeCall(
2605  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2606 
2607  // Remember for post-processing in worker loop.
2608  Work.emplace_back(WFn);
2609  };
2610 
2611  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2612  CodeGenFunction &CGF, PrePostActionTy &Action) {
2613  if (IsInParallelRegion) {
2614  SeqGen(CGF, Action);
2615  } else if (IsInTargetMasterThreadRegion) {
2616  L0ParallelGen(CGF, Action);
2617  } else {
2618  // Check for master and then parallelism:
2619  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2620  // Serialized execution.
2621  // } else {
2622  // Worker call.
2623  // }
2624  CGBuilderTy &Bld = CGF.Builder;
2625  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2626  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2627  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2628  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2629  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2630  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2631  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2632  // There is no need to emit line number for unconditional branch.
2633  (void)ApplyDebugLocation::CreateEmpty(CGF);
2634  CGF.EmitBlock(ParallelCheckBB);
2635  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2636  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2637  llvm::Value *PL = CGF.EmitRuntimeCall(
2638  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2639  {RTLoc, ThreadID});
2640  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2641  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2642  CGF.EmitBlock(SeqBB);
2643  SeqGen(CGF, Action);
2644  CGF.EmitBranch(ExitBB);
2645  // There is no need to emit line number for unconditional branch.
2646  (void)ApplyDebugLocation::CreateEmpty(CGF);
2647  CGF.EmitBlock(MasterBB);
2648  L0ParallelGen(CGF, Action);
2649  CGF.EmitBranch(ExitBB);
2650  // There is no need to emit line number for unconditional branch.
2651  (void)ApplyDebugLocation::CreateEmpty(CGF);
2652  // Emit the continuation block for code after the if.
2653  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2654  }
2655  };
2656 
2657  if (IfCond) {
2658  emitIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2659  } else {
2660  CodeGenFunction::RunCleanupsScope Scope(CGF);
2661  RegionCodeGenTy ThenRCG(LNParallelGen);
2662  ThenRCG(CGF);
2663  }
2664 }
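// The L0 master-to-worker handshake generated by L0ParallelGen above is,
// in simplified form (a sketch, not verbatim):
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   __kmpc_begin_sharing_variables(&args, n);  // only if variables are captured
//   syncCTAThreads();                          // release the workers
//   syncCTAThreads();                          // wait for the workers to finish
//   __kmpc_end_sharing_variables();            // only if variables are captured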
2665 
2666 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2667  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2668  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2669  // Just call the outlined function to execute the parallel region.
2670  // OutlinedFn(&GTid, &zero, CapturedStruct);
2671  //
2672  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2673 
2674  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2675  /*Name=*/".zero.addr");
2676  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2677  // ThreadId for serialized parallels is 0.
2678  Address ThreadIDAddr = ZeroAddr;
2679  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr](
2680  CodeGenFunction &CGF, PrePostActionTy &Action) {
2681  Action.Enter(CGF);
2682 
2683  Address ZeroAddr =
2684  CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2685  /*Name=*/".bound.zero.addr");
2686  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2687  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2688  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2689  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2690  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2691  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2692  };
2693  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2694  PrePostActionTy &) {
2695 
2696  RegionCodeGenTy RCG(CodeGen);
2697  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2698  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2699  llvm::Value *Args[] = {RTLoc, ThreadID};
2700 
2701  NVPTXActionTy Action(
2702  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2703  Args,
2704  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2705  Args);
2706  RCG.setAction(Action);
2707  RCG(CGF);
2708  };
2709 
2710  if (IsInTargetMasterThreadRegion) {
2711  // In the worker we need to use the real thread id.
2712  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2713  RegionCodeGenTy RCG(CodeGen);
2714  RCG(CGF);
2715  } else {
2716  // If we are not in the target region, it is definitely L2 parallelism or
2717  // more, because in SPMD mode we always have an L1 parallel level, so we
2718  // don't need to check for orphaned directives.
2719  RegionCodeGenTy RCG(SeqGen);
2720  RCG(CGF);
2721  }
2722 }
2723 
2724 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2725  // Always emit simple barriers!
2726  if (!CGF.HaveInsertPoint())
2727  return;
2728  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2729  // This function does not use parameters, so we can emit just default values.
2730  llvm::Value *Args[] = {
2731  llvm::ConstantPointerNull::get(
2732  cast<llvm::PointerType>(getIdentTyPointerTy())),
2733  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2734  llvm::CallInst *Call = CGF.EmitRuntimeCall(
2735  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2736  Call->setConvergent();
2737 }
2738 
2739 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2740  SourceLocation Loc,
2741  OpenMPDirectiveKind Kind, bool,
2742  bool) {
2743  // Always emit simple barriers!
2744  if (!CGF.HaveInsertPoint())
2745  return;
2746  // Build call __kmpc_barrier(loc, thread_id);
2747  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2748  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2749  getThreadID(CGF, Loc)};
2750  llvm::CallInst *Call = CGF.EmitRuntimeCall(
2751  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2752  Call->setConvergent();
2753 }
2754 
2755 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2756  CodeGenFunction &CGF, StringRef CriticalName,
2757  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2758  const Expr *Hint) {
2759  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2760  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2761  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2762  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2763  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2764 
2765  // Get the mask of active threads in the warp.
2766  llvm::Value *Mask = CGF.EmitRuntimeCall(
2767  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
2768  // Fetch team-local id of the thread.
2769  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2770 
2771  // Get the width of the team.
2772  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2773 
2774  // Initialize the counter variable for the loop.
2775  QualType Int32Ty =
2776  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2777  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2778  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2779  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2780  /*isInit=*/true);
2781 
2782  // This block checks whether the loop counter exceeds the upper bound.
2783  CGF.EmitBlock(LoopBB);
2784  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2785  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2786  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2787 
2788  // This block tests which single thread should execute the region, and
2789  // which threads should go straight to the synchronisation point.
2790  CGF.EmitBlock(TestBB);
2791  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2792  llvm::Value *CmpThreadToCounter =
2793  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2794  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2795 
2796  // Block emits the body of the critical region.
2797  CGF.EmitBlock(BodyBB);
2798 
2799  // Output the critical statement.
2800  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2801  Hint);
2802 
2803  // After the body surrounded by the critical region, the single executing
2804  // thread will jump to the synchronisation point.
2805  // This block waits for all threads in the current team to finish, then
2806  // increments the counter variable and returns to the loop.
2807  CGF.EmitBlock(SyncBB);
2808  // Reconverge active threads in the warp.
2809  (void)CGF.EmitRuntimeCall(
2810  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask);
2811 
2812  llvm::Value *IncCounterVal =
2813  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2814  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2815  CGF.EmitBranch(LoopBB);
2816 
2817  // Block that is reached when all threads in the team complete the region.
2818  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2819 }
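// The serialization loop emitted above behaves, in simplified form, like
// (a sketch, not verbatim):
//
//   mask = __kmpc_warp_active_thread_mask();
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical region body>;
//     __kmpc_syncwarp(mask);  // reconverge before the next iteration
//   }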
2820 
2821 /// Cast value to the specified type.
2822 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2823  QualType ValTy, QualType CastTy,
2824  SourceLocation Loc) {
2825  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2826  "Cast type must be sized.");
2827  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2828  "Val type must be sized.");
2829  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2830  if (ValTy == CastTy)
2831  return Val;
2832  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2833  CGF.getContext().getTypeSizeInChars(CastTy))
2834  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2835  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2836  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2837  CastTy->hasSignedIntegerRepresentation());
2838  Address CastItem = CGF.CreateMemTemp(CastTy);
2839  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2840  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2841  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2842  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2843 }
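// In summary, the cast ladder above: identical types pass through unchanged;
// equal-sized types are bitcast; integer-to-integer uses an integer cast with
// the destination's signedness; anything else is spilled to a temporary with
// the source type and reloaded with the destination type.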
2844 
2845 /// This function creates calls to one of two shuffle functions to copy
2846 /// variables between lanes in a warp.
2847 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2848  llvm::Value *Elem,
2849  QualType ElemType,
2850  llvm::Value *Offset,
2851  SourceLocation Loc) {
2852  CodeGenModule &CGM = CGF.CGM;
2853  CGBuilderTy &Bld = CGF.Builder;
2854  CGOpenMPRuntimeNVPTX &RT =
2855  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2856 
2857  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2858  assert(Size.getQuantity() <= 8 &&
2859  "Unsupported bitwidth in shuffle instruction.");
2860 
2861  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2862  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2863  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2864 
2865  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2866  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2867  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2868  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2869  llvm::Value *WarpSize =
2870  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2871 
2872  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2873  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2874 
2875  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2876 }
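// For example (an illustrative sketch in pseudo-IR, not verbatim output),
// shuffling a 4-byte 'float' element takes the equal-size bitcast path:
//
//   %tmp = bitcast float %elem to i32
//   %res = call i32 @__kmpc_shuffle_int32(i32 %tmp, i16 %offset, i16 %warpsize)
//   %out = bitcast i32 %res to float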
2877 
2878 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2879  Address DestAddr, QualType ElemType,
2880  llvm::Value *Offset, SourceLocation Loc) {
2881  CGBuilderTy &Bld = CGF.Builder;
2882 
2883  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2884  // Create the loop over the big sized data.
2885  // ptr = (void*)Elem;
2886  // ptrEnd = (void*) Elem + 1;
2887  // Step = 8;
2888  // while (ptr + Step < ptrEnd)
2889  // shuffle((int64_t)*ptr);
2890  // Step = 4;
2891  // while (ptr + Step < ptrEnd)
2892  // shuffle((int32_t)*ptr);
2893  // ...
2894  Address ElemPtr = DestAddr;
2895  Address Ptr = SrcAddr;
2896  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2897  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2898  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2899  if (Size < CharUnits::fromQuantity(IntSize))
2900  continue;
2901  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2902  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2903  /*Signed=*/1);
2904  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2905  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2906  ElemPtr =
2907  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2908  if (Size.getQuantity() / IntSize > 1) {
2909  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2910  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2911  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2912  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2913  CGF.EmitBlock(PreCondBB);
2914  llvm::PHINode *PhiSrc =
2915  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2916  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2917  llvm::PHINode *PhiDest =
2918  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2919  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2920  Ptr = Address(PhiSrc, Ptr.getAlignment());
2921  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2922  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2923  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2924  Ptr.getPointer(), CGF.VoidPtrTy));
2925  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2926  ThenBB, ExitBB);
2927  CGF.EmitBlock(ThenBB);
2928  llvm::Value *Res = createRuntimeShuffleFunction(
2929  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2930  IntType, Offset, Loc);
2931  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2932  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2933  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2934  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2935  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2936  CGF.EmitBranch(PreCondBB);
2937  CGF.EmitBlock(ExitBB);
2938  } else {
2939  llvm::Value *Res = createRuntimeShuffleFunction(
2940  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2941  IntType, Offset, Loc);
2942  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2943  Ptr = Bld.CreateConstGEP(Ptr, 1);
2944  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2945  }
2946  Size = Size % IntSize;
2947  }
2948 }
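
// As a worked example of the loop above: a 15-byte element is moved with
// one 8-byte, one 4-byte, one 2-byte, and one 1-byte shuffle, with Size
// shrinking 15 -> 7 -> 3 -> 1 -> 0 across iterations. The .shuffle.pre_cond
// loop is only emitted when Size.getQuantity() / IntSize > 1, e.g. for a
// 24-byte element shuffled as three 8-byte chunks.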
2949 
2950 namespace {
2951 enum CopyAction : unsigned {
2952  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2953  // the warp using shuffle instructions.
2954  RemoteLaneToThread,
2955  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2956  ThreadCopy,
2957  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2958  ThreadToScratchpad,
2959  // ScratchpadToThread: Copy from a scratchpad array in global memory
2960  // containing team-reduced data to a thread's stack.
2961  ScratchpadToThread,
2962 };
2963 } // namespace
2964 
2965 struct CopyOptionsTy {
2966   llvm::Value *RemoteLaneOffset;
2967   llvm::Value *ScratchpadIndex;
2968   llvm::Value *ScratchpadWidth;
2969 };
2970 
2971 /// Emit instructions to copy a Reduce list, which contains partially
2972 /// aggregated values, in the specified direction.
2973 static void emitReductionListCopy(
2974   CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2975  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2976  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2977 
2978  CodeGenModule &CGM = CGF.CGM;
2979  ASTContext &C = CGM.getContext();
2980  CGBuilderTy &Bld = CGF.Builder;
2981 
2982  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2983  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2984  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2985 
2986  // Iterates, element by element, through the source Reduce list and
2987  // makes a copy.
2988  unsigned Idx = 0;
2989  unsigned Size = Privates.size();
2990  for (const Expr *Private : Privates) {
2991  Address SrcElementAddr = Address::invalid();
2992  Address DestElementAddr = Address::invalid();
2993  Address DestElementPtrAddr = Address::invalid();
2994  // Should we shuffle in an element from a remote lane?
2995  bool ShuffleInElement = false;
2996  // Set to true to update the pointer in the dest Reduce list to a
2997  // newly created element.
2998  bool UpdateDestListPtr = false;
2999  // Increment the src or dest pointer to the scratchpad, for each
3000  // new element.
3001  bool IncrScratchpadSrc = false;
3002  bool IncrScratchpadDest = false;
3003 
3004  switch (Action) {
3005  case RemoteLaneToThread: {
3006  // Step 1.1: Get the address for the src element in the Reduce list.
3007  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3008  SrcElementAddr = CGF.EmitLoadOfPointer(
3009  SrcElementPtrAddr,
3010  C.getPointerType(Private->getType())->castAs<PointerType>());
3011 
3012  // Step 1.2: Create a temporary to store the element in the destination
3013  // Reduce list.
3014  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3015  DestElementAddr =
3016  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3017  ShuffleInElement = true;
3018  UpdateDestListPtr = true;
3019  break;
3020  }
3021  case ThreadCopy: {
3022  // Step 1.1: Get the address for the src element in the Reduce list.
3023  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3024  SrcElementAddr = CGF.EmitLoadOfPointer(
3025  SrcElementPtrAddr,
3026  C.getPointerType(Private->getType())->castAs<PointerType>());
3027 
3028  // Step 1.2: Get the address for dest element. The destination
3029  // element has already been created on the thread's stack.
3030  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3031  DestElementAddr = CGF.EmitLoadOfPointer(
3032  DestElementPtrAddr,
3033  C.getPointerType(Private->getType())->castAs<PointerType>());
3034  break;
3035  }
3036  case ThreadToScratchpad: {
3037  // Step 1.1: Get the address for the src element in the Reduce list.
3038  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3039  SrcElementAddr = CGF.EmitLoadOfPointer(
3040  SrcElementPtrAddr,
3041  C.getPointerType(Private->getType())->castAs<PointerType>());
3042 
3043  // Step 1.2: Get the address for dest element:
3044  // address = base + index * ElementSizeInChars.
3045  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3046  llvm::Value *CurrentOffset =
3047  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3048  llvm::Value *ScratchPadElemAbsolutePtrVal =
3049  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3050  ScratchPadElemAbsolutePtrVal =
3051  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3052  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3053  C.getTypeAlignInChars(Private->getType()));
3054  IncrScratchpadDest = true;
3055  break;
3056  }
3057  case ScratchpadToThread: {
3058  // Step 1.1: Get the address for the src element in the scratchpad.
3059  // address = base + index * ElementSizeInChars.
3060  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3061  llvm::Value *CurrentOffset =
3062  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3063  llvm::Value *ScratchPadElemAbsolutePtrVal =
3064  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3065  ScratchPadElemAbsolutePtrVal =
3066  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3067  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3068  C.getTypeAlignInChars(Private->getType()));
3069  IncrScratchpadSrc = true;
3070 
3071  // Step 1.2: Create a temporary to store the element in the destination
3072  // Reduce list.
3073  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3074  DestElementAddr =
3075  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3076  UpdateDestListPtr = true;
3077  break;
3078  }
3079  }
3080 
3081  // Regardless of src and dest of copy, we emit the load of src
3082  // element as this is required in all directions
3083  SrcElementAddr = Bld.CreateElementBitCast(
3084  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3085  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3086  SrcElementAddr.getElementType());
3087 
3088  // Now that all active lanes have read the element in the
3089  // Reduce list, shuffle over the value from the remote lane.
3090  if (ShuffleInElement) {
3091  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3092  RemoteLaneOffset, Private->getExprLoc());
3093  } else {
3094  switch (CGF.getEvaluationKind(Private->getType())) {
3095  case TEK_Scalar: {
3096  llvm::Value *Elem =
3097  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3098  Private->getType(), Private->getExprLoc());
3099  // Store the source element value to the dest element address.
3100  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3101  Private->getType());
3102  break;
3103  }
3104  case TEK_Complex: {
3105  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3106  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3107  Private->getExprLoc());
3108  CGF.EmitStoreOfComplex(
3109  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3110  /*isInit=*/false);
3111  break;
3112  }
3113  case TEK_Aggregate:
3114  CGF.EmitAggregateCopy(
3115  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3116  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3117  Private->getType(), AggValueSlot::DoesNotOverlap);
3118  break;
3119  }
3120  }
3121 
3122  // Step 3.1: Modify reference in dest Reduce list as needed.
3123  // Modifying the reference in Reduce list to point to the newly
3124  // created element. The element is live in the current function
3125  // scope and that of functions it invokes (i.e., reduce_function).
3126  // RemoteReduceData[i] = (void*)&RemoteElem
3127  if (UpdateDestListPtr) {
3128  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3129  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3130  DestElementPtrAddr, /*Volatile=*/false,
3131  C.VoidPtrTy);
3132  }
3133 
3134  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3135  // address of the next element in scratchpad memory, unless we're currently
3136  // processing the last one. Memory alignment is also taken care of here.
3137  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3138  llvm::Value *ScratchpadBasePtr =
3139  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3140  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3141  ScratchpadBasePtr = Bld.CreateNUWAdd(
3142  ScratchpadBasePtr,
3143  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3144 
3145  // Take care of global memory alignment for performance
3146  ScratchpadBasePtr = Bld.CreateNUWSub(
3147  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3148  ScratchpadBasePtr = Bld.CreateUDiv(
3149  ScratchpadBasePtr,
3150  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3151  ScratchpadBasePtr = Bld.CreateNUWAdd(
3152  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3153  ScratchpadBasePtr = Bld.CreateNUWMul(
3154  ScratchpadBasePtr,
3155  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3156 
3157  if (IncrScratchpadDest)
3158  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3159  else /* IncrScratchpadSrc = true */
3160  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3161  }
3162 
3163  ++Idx;
3164  }
3165 }
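
// For the RemoteLaneToThread action, the net effect of the code emitted
// above is roughly the following pseudocode per Reduce-list slot (a sketch
// of the generated IR, not literal runtime code):
//
//   for (i = 0; i < num_elements; ++i) {
//     src  = (ElemType *)SrcReduceList[i];
//     tmp  = alloca(ElemType);              // ".omp.reduction.element"
//     *tmp = shuffle_down(*src, RemoteLaneOffset);
//     DestReduceList[i] = (void *)tmp;      // UpdateDestListPtr
//   }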
3166 
3167 /// This function emits a helper that gathers Reduce lists from the first
3168 /// lane of every active warp to lanes in the first warp.
3169 ///
3170 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3171 /// shared smem[warp_size];
3172 /// For all data entries D in reduce_data:
3173 /// sync
3174 /// If (I am the first lane in each warp)
3175 /// Copy my local D to smem[warp_id]
3176 /// sync
3177 /// if (I am the first warp)
3178 /// Copy smem[thread_id] to my local D
3179 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3180  ArrayRef<const Expr *> Privates,
3181  QualType ReductionArrayTy,
3182  SourceLocation Loc) {
3183  ASTContext &C = CGM.getContext();
3184  llvm::Module &M = CGM.getModule();
3185 
3186  // ReduceList: thread local Reduce list.
3187  // At the stage of the computation when this function is called, partially
3188  // aggregated values reside in the first lane of every active warp.
3189  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3190  C.VoidPtrTy, ImplicitParamDecl::Other);
3191  // NumWarps: number of warps active in the parallel region. This could
3192  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3193  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3194  C.getIntTypeForBitwidth(32, /* Signed */ true),
3195  ImplicitParamDecl::Other);
3196  FunctionArgList Args;
3197  Args.push_back(&ReduceListArg);
3198  Args.push_back(&NumWarpsArg);
3199 
3200  const CGFunctionInfo &CGFI =
3201  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3202  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3203  llvm::GlobalValue::InternalLinkage,
3204  "_omp_reduction_inter_warp_copy_func", &M);
3205  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3206  Fn->setDoesNotRecurse();
3207  CodeGenFunction CGF(CGM);
3208  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3209 
3210  CGBuilderTy &Bld = CGF.Builder;
3211 
3212  // This array is used as a medium to transfer, one reduce element at a time,
3213  // the data from the first lane of every warp to lanes in the first warp
3214  // in order to perform the final step of a reduction in a parallel region
3215  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3216  // for reduced latency, as well as to have a distinct copy for concurrently
3217  // executing target regions. The array is declared with common linkage so
3218  // as to be shared across compilation units.
3219  StringRef TransferMediumName =
3220  "__openmp_nvptx_data_transfer_temporary_storage";
3221  llvm::GlobalVariable *TransferMedium =
3222  M.getGlobalVariable(TransferMediumName);
3223  if (!TransferMedium) {
3224  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3225  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3226  TransferMedium = new llvm::GlobalVariable(
3227  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3228  llvm::Constant::getNullValue(Ty), TransferMediumName,
3229  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3230  SharedAddressSpace);
3231  CGM.addCompilerUsedGlobal(TransferMedium);
3232  }
3233 
3234  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3235  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3236  // nvptx_lane_id = nvptx_id % warpsize
3237  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3238  // nvptx_warp_id = nvptx_id / warpsize
3239  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3240 
3241  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3242  Address LocalReduceList(
3243  Bld.CreatePointerBitCastOrAddrSpaceCast(
3244  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3245  C.VoidPtrTy, Loc),
3246  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3247  CGF.getPointerAlign());
3248 
3249  unsigned Idx = 0;
3250  for (const Expr *Private : Privates) {
3251  //
3252  // Warp master copies reduce element to transfer medium in __shared__
3253  // memory.
3254  //
3255  unsigned RealTySize =
3256  C.getTypeSizeInChars(Private->getType())
3257  .alignTo(C.getTypeAlignInChars(Private->getType()))
3258  .getQuantity();
3259  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3260  unsigned NumIters = RealTySize / TySize;
3261  if (NumIters == 0)
3262  continue;
3263  QualType CType = C.getIntTypeForBitwidth(
3264  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3265  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3266  CharUnits Align = CharUnits::fromQuantity(TySize);
3267  llvm::Value *Cnt = nullptr;
3268  Address CntAddr = Address::invalid();
3269  llvm::BasicBlock *PrecondBB = nullptr;
3270  llvm::BasicBlock *ExitBB = nullptr;
3271  if (NumIters > 1) {
3272  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3273  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3274  /*Volatile=*/false, C.IntTy);
3275  PrecondBB = CGF.createBasicBlock("precond");
3276  ExitBB = CGF.createBasicBlock("exit");
3277  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3278  // There is no need to emit line number for unconditional branch.
3279  (void)ApplyDebugLocation::CreateEmpty(CGF);
3280  CGF.EmitBlock(PrecondBB);
3281  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3282  llvm::Value *Cmp =
3283  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3284  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3285  CGF.EmitBlock(BodyBB);
3286  }
3287  // kmpc_barrier.
3288  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3289  /*EmitChecks=*/false,
3290  /*ForceSimpleCall=*/true);
3291  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3292  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3293  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3294 
3295  // if (lane_id == 0)
3296  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3297  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3298  CGF.EmitBlock(ThenBB);
3299 
3300  // Reduce element = LocalReduceList[i]
3301  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3302  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3303  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3304  // elemptr = ((CopyType*)(elemptrptr)) + I
3305  Address ElemPtr = Address(ElemPtrPtr, Align);
3306  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3307  if (NumIters > 1) {
3308  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3309  ElemPtr.getAlignment());
3310  }
3311 
3312  // Get pointer to location in transfer medium.
3313  // MediumPtr = &medium[warp_id]
3314  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3315  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3316  Address MediumPtr(MediumPtrVal, Align);
3317  // Casting to actual data type.
3318  // MediumPtr = (CopyType*)MediumPtrAddr;
3319  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3320 
3321  // elem = *elemptr
3322  //*MediumPtr = elem
3323  llvm::Value *Elem =
3324  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3325  // Store the source element value to the dest element address.
3326  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3327 
3328  Bld.CreateBr(MergeBB);
3329 
3330  CGF.EmitBlock(ElseBB);
3331  Bld.CreateBr(MergeBB);
3332 
3333  CGF.EmitBlock(MergeBB);
3334 
3335  // kmpc_barrier.
3336  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3337  /*EmitChecks=*/false,
3338  /*ForceSimpleCall=*/true);
3339 
3340  //
3341  // Warp 0 copies reduce element from transfer medium.
3342  //
3343  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3344  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3345  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3346 
3347  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3348  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3349  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3350 
3351  // Up to 32 threads in warp 0 are active.
3352  llvm::Value *IsActiveThread =
3353  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3354  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3355 
3356  CGF.EmitBlock(W0ThenBB);
3357 
3358  // SrcMediumPtr = &medium[tid]
3359  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3360  TransferMedium,
3361  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3362  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3363  // SrcMediumVal = *SrcMediumPtr;
3364  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3365 
3366  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3367  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3368  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3369  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3370  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3371  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3372  if (NumIters > 1) {
3373  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3374  TargetElemPtr.getAlignment());
3375  }
3376 
3377  // *TargetElemPtr = SrcMediumVal;
3378  llvm::Value *SrcMediumValue =
3379  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3380  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3381  CType);
3382  Bld.CreateBr(W0MergeBB);
3383 
3384  CGF.EmitBlock(W0ElseBB);
3385  Bld.CreateBr(W0MergeBB);
3386 
3387  CGF.EmitBlock(W0MergeBB);
3388 
3389  if (NumIters > 1) {
3390  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3391  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3392  CGF.EmitBranch(PrecondBB);
3393  (void)ApplyDebugLocation::CreateEmpty(CGF);
3394  CGF.EmitBlock(ExitBB);
3395  }
3396  RealTySize %= TySize;
3397  }
3398  ++Idx;
3399  }
3400 
3401  CGF.FinishFunction();
3402  return Fn;
3403 }
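
// The generated helper behaves roughly like the following CUDA-style
// pseudocode (a sketch; e.g. an 8-byte element travels as two 4-byte
// transfers through the 32-bit transfer medium):
//
//   __device__ void inter_warp_copy(void **reduce_data, int num_warps) {
//     __shared__ int32_t medium[WARPSIZE];
//     for (each 32-bit chunk C of each entry in reduce_data) {
//       barrier();
//       if (lane_id == 0)           // warp master publishes its chunk
//         medium[warp_id] = C;
//       barrier();
//       if (thread_id < num_warps)  // a lane of warp 0 collects it
//         C = medium[thread_id];
//     }
//   }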
3404 
3405 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3406 /// in the same warp. It uses shuffle instructions to copy over data from
3407 /// a remote lane's stack. The reduction algorithm performed is specified
3408 /// by the fourth parameter.
3409 ///
3410 /// Algorithm Versions.
3411 /// Full Warp Reduce (argument value 0):
3412 /// This algorithm assumes that all 32 lanes are active and gathers
3413 /// data from these 32 lanes, producing a single resultant value.
3414 /// Contiguous Partial Warp Reduce (argument value 1):
3415 /// This algorithm assumes that only a *contiguous* subset of lanes
3416 /// are active. This happens for the last warp in a parallel region
3417 /// when the user specified num_threads is not an integer multiple of
3418 /// 32. This contiguous subset always starts with the zeroth lane.
3419 /// Partial Warp Reduce (argument value 2):
3420 /// This algorithm gathers data from any number of lanes at any position.
3421 /// All reduced values are stored in the lowest possible lane. The set
3422 /// of problems every algorithm addresses is a super set of those
3423 /// addressable by algorithms with a lower version number. Overhead
3424 /// increases as algorithm version increases.
3425 ///
3426 /// Terminology
3427 /// Reduce element:
3428 /// Reduce element refers to the individual data field with primitive
3429 /// data types to be combined and reduced across threads.
3430 /// Reduce list:
3431 /// Reduce list refers to a collection of local, thread-private
3432 /// reduce elements.
3433 /// Remote Reduce list:
3434 /// Remote Reduce list refers to a collection of remote (relative to
3435 /// the current thread) reduce elements.
3436 ///
3437 /// We distinguish between three states of threads that are important to
3438 /// the implementation of this function.
3439 /// Alive threads:
3440 /// Threads in a warp executing the SIMT instruction, as distinguished from
3441 /// threads that are inactive due to divergent control flow.
3442 /// Active threads:
3443 /// The minimal set of threads that has to be alive upon entry to this
3444 /// function. The computation is correct iff active threads are alive.
3445 /// Some threads are alive but they are not active because they do not
3446 /// contribute to the computation in any useful manner. Turning them off
3447 /// may introduce control flow overheads without any tangible benefits.
3448 /// Effective threads:
3449 /// In order to comply with the argument requirements of the shuffle
3450 /// function, we must keep all lanes holding data alive. But at most
3451 /// half of them perform value aggregation; we refer to this half of
3452 /// threads as effective. The other half is simply handing off their
3453 /// data.
3454 ///
3455 /// Procedure
3456 /// Value shuffle:
3457 /// In this step active threads transfer data from higher lane positions
3458 /// in the warp to lower lane positions, creating Remote Reduce list.
3459 /// Value aggregation:
3460 /// In this step, effective threads combine their thread local Reduce list
3461 /// with Remote Reduce list and store the result in the thread local
3462 /// Reduce list.
3463 /// Value copy:
3464 /// In this step, we deal with the assumption made by algorithm 2
3465 /// (i.e. contiguity assumption). When we have an odd number of lanes
3466 /// active, say 2k+1, only k threads will be effective and therefore k
3467 /// new values will be produced. However, the Reduce list owned by the
3468 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3469 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3470 /// that the contiguity assumption still holds.
3471 static llvm::Function *emitShuffleAndReduceFunction(
3472  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3473  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3474  ASTContext &C = CGM.getContext();
3475 
3476  // Thread local Reduce list used to host the values of data to be reduced.
3477  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3479  // Current lane id; could be logical.
3480  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3481  ImplicitParamDecl::Other);
3482  // Offset of the remote source lane relative to the current lane.
3483  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3484  C.ShortTy, ImplicitParamDecl::Other);
3485  // Algorithm version. This is expected to be known at compile time.
3486  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3487  C.ShortTy, ImplicitParamDecl::Other);
3488  FunctionArgList Args;
3489  Args.push_back(&ReduceListArg);
3490  Args.push_back(&LaneIDArg);
3491  Args.push_back(&RemoteLaneOffsetArg);
3492  Args.push_back(&AlgoVerArg);
3493 
3494  const CGFunctionInfo &CGFI =
3495  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3496  auto *Fn = llvm::Function::Create(
3497  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3498  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3499  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3500  Fn->setDoesNotRecurse();
3501  if (CGM.getLangOpts().Optimize) {
3502  Fn->removeFnAttr(llvm::Attribute::NoInline);
3503  Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
3504  Fn->addFnAttr(llvm::Attribute::AlwaysInline);
3505  }
3506 
3507  CodeGenFunction CGF(CGM);
3508  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3509 
3510  CGBuilderTy &Bld = CGF.Builder;
3511 
3512  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3513  Address LocalReduceList(
3514  Bld.CreatePointerBitCastOrAddrSpaceCast(
3515  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3516  C.VoidPtrTy, SourceLocation()),
3517  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3518  CGF.getPointerAlign());
3519 
3520  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3521  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3522  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3523 
3524  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3525  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3526  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3527 
3528  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3529  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3530  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3531 
3532  // Create a local thread-private variable to host the Reduce list
3533  // from a remote lane.
3534  Address RemoteReduceList =
3535  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3536 
3537  // This loop iterates through the list of reduce elements and copies,
3538  // element by element, from a remote lane in the warp to RemoteReduceList,
3539  // hosted on the thread's stack.
3540  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3541  LocalReduceList, RemoteReduceList,
3542  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3543  /*ScratchpadIndex=*/nullptr,
3544  /*ScratchpadWidth=*/nullptr});
3545 
3546  // The actions to be performed on the Remote Reduce list are dependent
3547  // on the algorithm version.
3548  //
3549  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3550  // LaneId % 2 == 0 && Offset > 0):
3551  // do the reduction value aggregation
3552  //
3553  // The thread local variable Reduce list is mutated in place to host the
3554  // reduced data, which is the aggregated value produced from local and
3555  // remote lanes.
3556  //
3557  // Note that AlgoVer is expected to be a constant integer known at compile
3558  // time.
3559  // When AlgoVer==0, the first conjunction evaluates to true, making
3560  // the entire predicate true at compile time.
3561  // When AlgoVer==1, only the second part of the second conjunction must
3562  // be evaluated at runtime; the other conjunctions evaluate to false
3563  // at compile time.
3564  // When AlgoVer==2, only the second part of the third conjunction must
3565  // be evaluated at runtime; the other conjunctions evaluate to false
3566  // at compile time.
3567  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3568 
3569  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3570  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3571  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3572 
3573  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3574  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3575  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3576  CondAlgo2 = Bld.CreateAnd(
3577  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3578 
3579  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3580  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3581 
3582  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3583  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3584  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3585  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3586 
3587  CGF.EmitBlock(ThenBB);
3588  // reduce_function(LocalReduceList, RemoteReduceList)
3589  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3590  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3591  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3592  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3593  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3594  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3595  Bld.CreateBr(MergeBB);
3596 
3597  CGF.EmitBlock(ElseBB);
3598  Bld.CreateBr(MergeBB);
3599 
3600  CGF.EmitBlock(MergeBB);
3601 
3602  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3603  // Reduce list.
3604  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3605  llvm::Value *CondCopy = Bld.CreateAnd(
3606  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3607 
3608  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3609  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3610  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3611  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3612 
3613  CGF.EmitBlock(CpyThenBB);
3614  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3615  RemoteReduceList, LocalReduceList);
3616  Bld.CreateBr(CpyMergeBB);
3617 
3618  CGF.EmitBlock(CpyElseBB);
3619  Bld.CreateBr(CpyMergeBB);
3620 
3621  CGF.EmitBlock(CpyMergeBB);
3622 
3623  CGF.FinishFunction();
3624  return Fn;
3625 }
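
// Putting the pieces together, the emitted function behaves like the
// sketch below; since AlgoVer is a compile-time constant, the conditions
// fold away after inlining:
//
//   void shuffle_and_reduce(void *local, int16_t lane_id,
//                           int16_t offset, int16_t algo) {
//     remote = shuffle_down_reduce_list(local, offset);
//     if (algo == 0 ||
//         (algo == 1 && lane_id < offset) ||
//         (algo == 2 && lane_id % 2 == 0 && offset > 0))
//       reduce_function(local, remote);    // aggregate in place
//     if (algo == 1 && lane_id >= offset)
//       copy_reduce_list(remote, local);   // keep active lanes contiguous
//   }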
3626 
3627 /// This function emits a helper that copies all the reduction variables from
3628 /// the team into the provided global buffer for the reduction variables.
3629 ///
3630 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3631 /// For all data entries D in reduce_data:
3632 /// Copy local D to buffer.D[Idx]
3633 static llvm::Value *emitListToGlobalCopyFunction(
3634   CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3635  QualType ReductionArrayTy, SourceLocation Loc,
3636  const RecordDecl *TeamReductionRec,
3637  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3638  &VarFieldMap) {
3639  ASTContext &C = CGM.getContext();
3640 
3641  // Buffer: global reduction buffer.
3642  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3643  C.VoidPtrTy, ImplicitParamDecl::Other);
3644  // Idx: index of the buffer.
3645  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3646  ImplicitParamDecl::Other);
3647  // ReduceList: thread local Reduce list.
3648  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3649  C.VoidPtrTy, ImplicitParamDecl::Other);
3650  FunctionArgList Args;
3651  Args.push_back(&BufferArg);
3652  Args.push_back(&IdxArg);
3653  Args.push_back(&ReduceListArg);
3654 
3655  const CGFunctionInfo &CGFI =
3656  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3657  auto *Fn = llvm::Function::Create(
3658  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3659  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3660  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3661  Fn->setDoesNotRecurse();
3662  CodeGenFunction CGF(CGM);
3663  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3664 
3665  CGBuilderTy &Bld = CGF.Builder;
3666 
3667  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3668  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3669  Address LocalReduceList(
3670  Bld.CreatePointerBitCastOrAddrSpaceCast(
3671  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3672  C.VoidPtrTy, Loc),
3673  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3674  CGF.getPointerAlign());
3675  QualType StaticTy = C.getRecordType(TeamReductionRec);
3676  llvm::Type *LLVMReductionsBufferTy =
3677  CGM.getTypes().ConvertTypeForMem(StaticTy);
3678  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3679  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3680  LLVMReductionsBufferTy->getPointerTo());
3681  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3682  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3683  /*Volatile=*/false, C.IntTy,
3684  Loc)};
3685  unsigned Idx = 0;
3686  for (const Expr *Private : Privates) {
3687  // Reduce element = LocalReduceList[i]
3688  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3689  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3690  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3691  // elemptr = ((CopyType*)(elemptrptr)) + I
3692  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3693  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3694  Address ElemPtr =
3695  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3696  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3697  // Global = Buffer.VD[Idx];
3698  const FieldDecl *FD = VarFieldMap.lookup(VD);
3699  LValue GlobLVal = CGF.EmitLValueForField(
3700  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3701  llvm::Value *BufferPtr =
3702  Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
3703  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3704  switch (CGF.getEvaluationKind(Private->getType())) {
3705  case TEK_Scalar: {
3706  llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3707  Private->getType(), Loc);
3708  CGF.EmitStoreOfScalar(V, GlobLVal);
3709  break;
3710  }
3711  case TEK_Complex: {
3712  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3713  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3714  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3715  break;
3716  }
3717  case TEK_Aggregate:
3718  CGF.EmitAggregateCopy(GlobLVal,
3719  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3720  Private->getType(), AggValueSlot::DoesNotOverlap);
3721  break;
3722  }
3723  ++Idx;
3724  }
3725 
3726  CGF.FinishFunction();
3727  return Fn;
3728 }
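
// In effect, the emitted helper performs the following copy, with one
// field per reduction variable in the globalized record (a sketch):
//
//   void list_to_global_copy(Buffer *buffer, int idx, void **reduce_data) {
//     for (each reduction variable V, held in slot i of reduce_data)
//       buffer->V[idx] = *(VType *)reduce_data[i];
//   }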
3729 
3730 /// This function emits a helper that reduces all the reduction variables from
3731 /// the team into the provided global buffer for the reduction variables.
3732 ///
3733 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3734 /// void *GlobPtrs[];
3735 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3736 /// ...
3737 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3738 /// reduce_function(GlobPtrs, reduce_data);
3739 static llvm::Value *emitListToGlobalReduceFunction(
3740   CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3741  QualType ReductionArrayTy, SourceLocation Loc,
3742  const RecordDecl *TeamReductionRec,
3743  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3744  &VarFieldMap,
3745  llvm::Function *ReduceFn) {
3746  ASTContext &C = CGM.getContext();
3747 
3748  // Buffer: global reduction buffer.
3749  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3750  C.VoidPtrTy, ImplicitParamDecl::Other);
3751  // Idx: index of the buffer.
3752  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3753  ImplicitParamDecl::Other);
3754  // ReduceList: thread local Reduce list.
3755  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3756  C.VoidPtrTy, ImplicitParamDecl::Other);
3757  FunctionArgList Args;
3758  Args.push_back(&BufferArg);
3759  Args.push_back(&IdxArg);
3760  Args.push_back(&ReduceListArg);
3761 
3762  const CGFunctionInfo &CGFI =
3763  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3764  auto *Fn = llvm::Function::Create(
3765  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3766  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3767  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3768  Fn->setDoesNotRecurse();
3769  CodeGenFunction CGF(CGM);
3770  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3771 
3772  CGBuilderTy &Bld = CGF.Builder;
3773 
3774  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3775  QualType StaticTy = C.getRecordType(TeamReductionRec);
3776  llvm::Type *LLVMReductionsBufferTy =
3777  CGM.getTypes().ConvertTypeForMem(StaticTy);
3778  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3779  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3780  LLVMReductionsBufferTy->getPointerTo());
3781 
3782  // 1. Build a list of reduction variables.
3783  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3784  Address ReductionList =
3785  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3786  auto IPriv = Privates.begin();
3787  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3788  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3789  /*Volatile=*/false, C.IntTy,
3790  Loc)};
3791  unsigned Idx = 0;
3792  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3793  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3794  // Global = Buffer.VD[Idx];
3795  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3796  const FieldDecl *FD = VarFieldMap.lookup(VD);
3797  LValue GlobLVal = CGF.EmitLValueForField(
3798  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3799  llvm::Value *BufferPtr =
3800  Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
3801  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3802  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3803  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3804  // Store array size.
3805  ++Idx;
3806  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3807  llvm::Value *Size = CGF.Builder.CreateIntCast(
3808  CGF.getVLASize(
3809  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3810  .NumElts,
3811  CGF.SizeTy, /*isSigned=*/false);
3812  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3813  Elem);
3814  }
3815  }
3816 
3817  // Call reduce_function(GlobalReduceList, ReduceList)
3818  llvm::Value *GlobalReduceList =
3819  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3820  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3821  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3822  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3823  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3824  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3825  CGF.FinishFunction();
3826  return Fn;
3827 }
3828 
3829 /// This function emits a helper that copies all the reduction variables
3830 /// from the provided global buffer back into the team's Reduce list.
3831 ///
3832 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3833 /// For all data entries D in reduce_data:
3834 /// Copy buffer.D[Idx] to local D;
3835 static llvm::Value *emitGlobalToListCopyFunction(
3836   CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3837  QualType ReductionArrayTy, SourceLocation Loc,
3838  const RecordDecl *TeamReductionRec,
3839  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3840  &VarFieldMap) {
3841  ASTContext &C = CGM.getContext();
3842 
3843  // Buffer: global reduction buffer.
3844  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3845  C.VoidPtrTy, ImplicitParamDecl::Other);
3846  // Idx: index of the buffer.
3847  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3848  ImplicitParamDecl::Other);
3849  // ReduceList: thread local Reduce list.
3850  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3851  C.VoidPtrTy, ImplicitParamDecl::Other);
3852  FunctionArgList Args;
3853  Args.push_back(&BufferArg);
3854  Args.push_back(&IdxArg);
3855  Args.push_back(&ReduceListArg);
3856 
3857  const CGFunctionInfo &CGFI =
3858  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3859  auto *Fn = llvm::Function::Create(
3860  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3861  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3862  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3863  Fn->setDoesNotRecurse();
3864  CodeGenFunction CGF(CGM);
3865  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3866 
3867  CGBuilderTy &Bld = CGF.Builder;
3868 
3869  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3870  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3871  Address LocalReduceList(
3872  Bld.CreatePointerBitCastOrAddrSpaceCast(
3873  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3874  C.VoidPtrTy, Loc),
3875  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3876  CGF.getPointerAlign());
3877  QualType StaticTy = C.getRecordType(TeamReductionRec);
3878  llvm::Type *LLVMReductionsBufferTy =
3879  CGM.getTypes().ConvertTypeForMem(StaticTy);
3880  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3881  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3882  LLVMReductionsBufferTy->getPointerTo());
3883 
3884  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3885  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3886  /*Volatile=*/false, C.IntTy,
3887  Loc)};
3888  unsigned Idx = 0;
3889  for (const Expr *Private : Privates) {
3890  // Reduce element = LocalReduceList[i]
3891  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3892  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3893  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3894  // elemptr = ((CopyType*)(elemptrptr)) + I
3895  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3896  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3897  Address ElemPtr =
3898  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3899  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3900  // Global = Buffer.VD[Idx];
3901  const FieldDecl *FD = VarFieldMap.lookup(VD);
3902  LValue GlobLVal = CGF.EmitLValueForField(
3903  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3904  llvm::Value *BufferPtr =
3905  Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
3906  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3907  switch (CGF.getEvaluationKind(Private->getType())) {
3908  case TEK_Scalar: {
3909  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3910  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3911  break;
3912  }
3913  case TEK_Complex: {
3914  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3915  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3916  /*isInit=*/false);
3917  break;
3918  }
3919  case TEK_Aggregate:
3920  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3921  GlobLVal, Private->getType(),
3922  AggValueSlot::DoesNotOverlap);
3923  break;
3924  }
3925  ++Idx;
3926  }
3927 
3928  CGF.FinishFunction();
3929  return Fn;
3930 }
3931 
3932 /// This function emits a helper that reduces all the reduction variables
3933 /// from the provided global buffer into the team's thread-local Reduce list.
3934 ///
3935 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3936 /// void *GlobPtrs[];
3937 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3938 /// ...
3939 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3940 /// reduce_function(reduce_data, GlobPtrs);
3941 static llvm::Value *emitGlobalToListReduceFunction(
3942   CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3943  QualType ReductionArrayTy, SourceLocation Loc,
3944  const RecordDecl *TeamReductionRec,
3945  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3946  &VarFieldMap,
3947  llvm::Function *ReduceFn) {
3948  ASTContext &C = CGM.getContext();
3949 
3950  // Buffer: global reduction buffer.
3951  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3952  C.VoidPtrTy, ImplicitParamDecl::Other);
3953  // Idx: index of the buffer.
3954  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3955  ImplicitParamDecl::Other);
3956  // ReduceList: thread local Reduce list.
3957  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3958  C.VoidPtrTy, ImplicitParamDecl::Other);
3959  FunctionArgList Args;
3960  Args.push_back(&BufferArg);
3961  Args.push_back(&IdxArg);
3962  Args.push_back(&ReduceListArg);
3963 
3964  const CGFunctionInfo &CGFI =
3965  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3966  auto *Fn = llvm::Function::Create(
3967  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3968  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3969  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3970  Fn->setDoesNotRecurse();
3971  CodeGenFunction CGF(CGM);
3972  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3973 
3974  CGBuilderTy &Bld = CGF.Builder;
3975 
3976  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3977  QualType StaticTy = C.getRecordType(TeamReductionRec);
3978  llvm::Type *LLVMReductionsBufferTy =
3979  CGM.getTypes().ConvertTypeForMem(StaticTy);
3980  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3981  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3982  LLVMReductionsBufferTy->getPointerTo());
3983 
3984  // 1. Build a list of reduction variables.
3985  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3986  Address ReductionList =
3987  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3988  auto IPriv = Privates.begin();
3989  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3990  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3991  /*Volatile=*/false, C.IntTy,
3992  Loc)};
3993  unsigned Idx = 0;
3994  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3995  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3996  // Global = Buffer.VD[Idx];
3997  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3998  const FieldDecl *FD = VarFieldMap.lookup(VD);
3999  LValue GlobLVal = CGF.EmitLValueForField(
4000  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
4001  llvm::Value *BufferPtr =
4002  Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs);
4003  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
4004  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
4005  if ((*IPriv)->getType()->isVariablyModifiedType()) {
4006  // Store array size.
4007  ++Idx;
4008  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4009  llvm::Value *Size = CGF.Builder.CreateIntCast(
4010  CGF.getVLASize(
4011  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4012  .NumElts,
4013  CGF.SizeTy, /*isSigned=*/false);
4014  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4015  Elem);
4016  }
4017  }
4018 
4019  // Call reduce_function(ReduceList, GlobalReduceList)
4020  llvm::Value *GlobalReduceList =
4021  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
4022  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
4023  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
4024  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
4025  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
4026  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
4027  CGF.FinishFunction();
4028  return Fn;
4029 }
4030 
4031 ///
4032 /// Design of OpenMP reductions on the GPU
4033 ///
4034 /// Consider a typical OpenMP program with one or more reduction
4035 /// clauses:
4036 ///
4037 /// float foo;
4038 /// double bar;
4039 /// #pragma omp target teams distribute parallel for \
4040 /// reduction(+:foo) reduction(*:bar)
4041 /// for (int i = 0; i < N; i++) {
4042 /// foo += A[i]; bar *= B[i];
4043 /// }
4044 ///
4045 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
4046 /// all teams. In our OpenMP implementation on the NVPTX device an
4047 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
4048 /// within a team are mapped to CUDA threads within a threadblock.
4049 /// Our goal is to efficiently aggregate values across all OpenMP
4050 /// threads such that:
4051 ///
4052 /// - the compiler and runtime are logically concise, and
4053 /// - the reduction is performed efficiently in a hierarchical
4054 /// manner as follows: within OpenMP threads in the same warp,
4055 /// across warps in a threadblock, and finally across teams on
4056 /// the NVPTX device.
4057 ///
4058 /// Introduction to Decoupling
4059 ///
4060 /// We would like to decouple the compiler and the runtime so that the
4061 /// latter is ignorant of the reduction variables (number, data types)
4062 /// and the reduction operators. This allows a simpler interface
4063 /// and implementation while still attaining good performance.
4064 ///
4065 /// Pseudocode for the aforementioned OpenMP program generated by the
4066 /// compiler is as follows:
4067 ///
4068 /// 1. Create private copies of reduction variables on each OpenMP
4069 /// thread: 'foo_private', 'bar_private'
4070 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
4071 /// to it and writes the result in 'foo_private' and 'bar_private'
4072 /// respectively.
4073 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
4074 /// and store the result on the team master:
4075 ///
4076 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4077 /// reduceData, shuffleReduceFn, interWarpCpyFn)
4078 ///
4079 /// where:
4080 /// struct ReduceData {
4081 /// double *foo;
4082 /// double *bar;
4083 /// } reduceData
4084 /// reduceData.foo = &foo_private
4085 /// reduceData.bar = &bar_private
4086 ///
4087 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4088 /// auxiliary functions generated by the compiler that operate on
4089 /// variables of type 'ReduceData'. They help the runtime perform
4090 /// algorithmic steps in a data-agnostic manner.
4091 ///
4092 /// 'shuffleReduceFn' is a pointer to a function that reduces data
4093 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
4094 /// same warp. It takes the following arguments as input:
4095 ///
4096 /// a. variable of type 'ReduceData' on the calling lane,
4097 /// b. its lane_id,
4098 /// c. an offset relative to the current lane_id to generate a
4099 /// remote_lane_id. The remote lane contains the second
4100 /// variable of type 'ReduceData' that is to be reduced.
4101 /// d. an algorithm version parameter determining which reduction
4102 /// algorithm to use.
4103 ///
4104 /// 'shuffleReduceFn' retrieves data from the remote lane using
4105 /// efficient GPU shuffle intrinsics and reduces, using the
4106 /// algorithm specified by the 4th parameter, the two operands
4107 /// element-wise. The result is written to the first operand.
4108 ///
4109 /// Different reduction algorithms are implemented in different
4110 /// runtime functions, all calling 'shuffleReduceFn' to perform
4111 /// the essential reduction step. Therefore, based on the 4th
4112 /// parameter, this function behaves slightly differently to
4113 /// cooperate with the runtime to ensure correctness under
4114 /// different circumstances.
4115 ///
4116 /// 'InterWarpCpyFn' is a pointer to a function that transfers
4117 /// reduced variables across warps. It tunnels, through CUDA
4118 /// shared memory, the thread-private data of type 'ReduceData'
4119 /// from lane 0 of each warp to a lane in the first warp.
4120 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4121 /// The last team writes the global reduced value to memory.
4122 ///
4123 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
4124 /// reduceData, shuffleReduceFn, interWarpCpyFn,
4125 /// scratchpadCopyFn, loadAndReduceFn)
4126 ///
4127 /// 'scratchpadCopyFn' is a helper that stores reduced
4128 /// data from the team master to a scratchpad array in
4129 /// global memory.
4130 ///
4131 /// 'loadAndReduceFn' is a helper that loads data from
4132 /// the scratchpad array and reduces it with the input
4133 /// operand.
4134 ///
4135 /// These compiler generated functions hide address
4136 /// calculation and alignment information from the runtime.
4137 /// 5. if ret == 1:
4138 /// The team master of the last team stores the reduced
4139 /// result to the globals in memory.
4140 /// foo += reduceData.foo; bar *= reduceData.bar
4141 ///
4142 ///
4143 /// Warp Reduction Algorithms
4144 ///
4145 /// On the warp level, we have three algorithms implemented in the
4146 /// OpenMP runtime depending on the number of active lanes:
4147 ///
4148 /// Full Warp Reduction
4149 ///
4150 /// The reduce algorithm within a warp where all lanes are active
4151 /// is implemented in the runtime as follows:
4152 ///
4153 /// full_warp_reduce(void *reduce_data,
4154 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4155 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4156 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
4157 /// }
4158 ///
4159 /// The algorithm completes in log(2, WARPSIZE) steps.
4160 ///
4161 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
4162 /// not used; we therefore save instructions by not retrieving lane_id
4163 /// from the corresponding special registers. The 4th parameter, which
4164 /// represents the version of the algorithm being used, is set to 0 to
4165 /// signify full warp reduction.
4166 ///
4167 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4168 ///
4169 /// #reduce_elem refers to an element in the local lane's data structure
4170 /// #remote_elem is retrieved from a remote lane
4171 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4172 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
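///
/// For example, with WARPSIZE == 32 and a '+' reduction, offset takes
/// the values 16, 8, 4, 2, 1: after the first step each lane i < 16
/// holds lane_i + lane_(i+16), and after log2(32) == 5 steps lane 0
/// holds the value reduced across all 32 lanes (illustrative trace).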
4173 ///
4174 /// Contiguous Partial Warp Reduction
4175 ///
4176 /// This reduce algorithm is used within a warp where only the first
4177 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
4178 /// number of OpenMP threads in a parallel region is not a multiple of
4179 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
4180 ///
4181 /// void
4182 /// contiguous_partial_reduce(void *reduce_data,
4183 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
4184 /// int size, int lane_id) {
4185 /// int curr_size;
4186 /// int offset;
4187 /// curr_size = size;
4188 /// offset = curr_size/2;
4189 /// while (offset>0) {
4190 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4191 /// curr_size = (curr_size+1)/2;
4192 /// offset = curr_size/2;
4193 /// }
4194 /// }
4195 ///
4196 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4197 ///
4198 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4199 /// if (lane_id < offset)
4200 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4201 /// else
4202 /// reduce_elem = remote_elem
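///
/// For example, with size == 5 active lanes, offset takes the values
/// 2, 1, 1: in the first step lanes 0 and 1 aggregate the values of
/// lanes 2 and 3 while lane 4's value is copied down to lane 2; two
/// more steps leave the fully reduced value in lane 0 (illustrative
/// trace).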
4203 ///
4204 /// This algorithm assumes that the data to be reduced are located in a
4205 /// contiguous subset of lanes starting from the first. When there is
4206 /// an odd number of active lanes, the data in the last lane is not
4207 /// aggregated with any other lane's data but is instead copied over.
4208 ///
4209 /// Dispersed Partial Warp Reduction
4210 ///
4211 /// This algorithm is used within a warp when any discontiguous subset of
4212 /// lanes are active. It is used to implement the reduction operation
4213 /// across lanes in an OpenMP simd region or in a nested parallel region.
4214 ///
4215 /// void
4216 /// dispersed_partial_reduce(void *reduce_data,
4217 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4218 /// int size, remote_id;
4219 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
4220 /// do {
4221 /// remote_id = next_active_lane_id_right_after_me();
4222 /// # the above function returns 0 if no active lane
4223 /// # is present right after the current lane.
4224 /// size = number_of_active_lanes_in_this_warp();
4225 /// logical_lane_id /= 2;
4226 /// ShuffleReduceFn(reduce_data, logical_lane_id,
4227 /// remote_id-1-threadIdx.x, 2);
4228 /// } while (logical_lane_id % 2 == 0 && size > 1);
4229 /// }
4230 ///
4231 /// There is no assumption made about the initial state of the reduction.
4232 /// Any number of lanes (>=1) could be active at any position. The reduction
4233 /// result is returned in the first active lane.
4234 ///
4235 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4236 ///
4237 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4238 /// if (lane_id % 2 == 0 && offset > 0)
4239 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4240 /// else
4241 /// reduce_elem = remote_elem
4242 ///
4243 ///
4244 /// Intra-Team Reduction
4245 ///
4246 /// This function, as implemented in the runtime call
4247 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4248 /// threads in a team. It first reduces within a warp using the
4249 /// aforementioned algorithms. We then proceed to gather all such
4250 /// reduced values at the first warp.
4251 ///
4252 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
4253 /// data from each of the "warp master" (zeroth lane of each warp, where
4254 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
4255 /// a mathematical sense) the problem of reduction across warp masters in
4256 /// a block to the problem of warp reduction.
4257 ///
4258 ///
4259 /// Inter-Team Reduction
4260 ///
4261 /// Once a team has reduced its data to a single value, it is stored in
4262 /// a global scratchpad array. Since each team has a distinct slot, this
4263 /// can be done without locking.
4264 ///
4265 /// The last team to write to the scratchpad array proceeds to reduce the
4266 /// scratchpad array. One or more workers in the last team use the helper
4267 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4268 /// the k'th worker reduces every k'th element.
4269 ///
4270 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4271 /// reduce across workers and compute a globally reduced value.
4272 ///
4273 void CGOpenMPRuntimeNVPTX::emitReduction(
4274     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4275     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4276     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4277  if (!CGF.HaveInsertPoint())
4278  return;
4279 
4280  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4281 #ifndef NDEBUG
4282  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4283 #endif
4284 
4285  if (Options.SimpleReduction) {
4286  assert(!TeamsReduction && !ParallelReduction &&
4287  "Invalid reduction selection in emitReduction.");
4288  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4289  ReductionOps, Options);
4290  return;
4291  }
4292 
4293  assert((TeamsReduction || ParallelReduction) &&
4294  "Invalid reduction selection in emitReduction.");
4295 
4296  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4297  // RedList, shuffle_reduce_func, interwarp_copy_func);
4298  // or
4299  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4300  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4301  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4302 
4303  llvm::Value *Res;
4304  ASTContext &C = CGM.getContext();
4305  // 1. Build a list of reduction variables.
4306  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4307  auto Size = RHSExprs.size();
4308  for (const Expr *E : Privates) {
4309  if (E->getType()->isVariablyModifiedType())
4310  // Reserve place for array size.
4311  ++Size;
4312  }
4313  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4314  QualType ReductionArrayTy =
4315  C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
4316  /*IndexTypeQuals=*/0);
4317  Address ReductionList =
4318  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4319  auto IPriv = Privates.begin();
4320  unsigned Idx = 0;
4321  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4322  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4323     CGF.Builder.CreateStore(
4324         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4325             CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
4326         Elem);
4327  if ((*IPriv)->getType()->isVariablyModifiedType()) {
4328  // Store array size.
4329  ++Idx;
4330  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4331  llvm::Value *Size = CGF.Builder.CreateIntCast(
4332  CGF.getVLASize(
4333  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4334  .NumElts,
4335  CGF.SizeTy, /*isSigned=*/false);
4336  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4337  Elem);
4338  }
4339  }
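  // Illustrative layout (a sketch, not emitted text): for a clause like
  // 'reduction(+ : s, v)' where 'v' has variably-modified type, the list is
  //   RedList[0] = &s;         // address of the scalar reduction copy
  //   RedList[1] = &v[0];      // address of the VLA data
  //   RedList[2] = (void *)n;  // VLA element count, stored as a pointer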
4340 
4341   llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4342       ReductionList.getPointer(), CGF.VoidPtrTy);
4343  llvm::Function *ReductionFn = emitReductionFunction(
4344  Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4345  LHSExprs, RHSExprs, ReductionOps);
4346  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4347  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4348  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4349  llvm::Value *InterWarpCopyFn =
4350  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4351 
4352  if (ParallelReduction) {
4353  llvm::Value *Args[] = {RTLoc,
4354  ThreadId,
4355  CGF.Builder.getInt32(RHSExprs.size()),
4356  ReductionArrayTySize,
4357  RL,
4358  ShuffleAndReduceFn,
4359  InterWarpCopyFn};
4360 
4361  Res = CGF.EmitRuntimeCall(
4362         createNVPTXRuntimeFunction(
4363             OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4364  Args);
4365  } else {
4366  assert(TeamsReduction && "expected teams reduction.");
4367  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4368  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4369  int Cnt = 0;
4370  for (const Expr *DRE : Privates) {
4371  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4372  ++Cnt;
4373  }
4374  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4375  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4376  C.getLangOpts().OpenMPCUDAReductionBufNum);
4377  TeamsReductions.push_back(TeamReductionRec);
4378  if (!KernelTeamsReductionPtr) {
4379  KernelTeamsReductionPtr = new llvm::GlobalVariable(
4380  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4382  "_openmp_teams_reductions_buffer_$_$ptr");
4383  }
4384  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4385  Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4386  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4387  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4388  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4389  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4390  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4391  ReductionFn);
4392  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4393  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4394  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4395  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4396  ReductionFn);
4397 
4398  llvm::Value *Args[] = {
4399  RTLoc,
4400  ThreadId,
4401  GlobalBufferPtr,
4402  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4403  RL,
4404  ShuffleAndReduceFn,
4405  InterWarpCopyFn,
4406  GlobalToBufferCpyFn,
4407  GlobalToBufferRedFn,
4408  BufferToGlobalCpyFn,
4409  BufferToGlobalRedFn};
4410 
4411  Res = CGF.EmitRuntimeCall(
4412         createNVPTXRuntimeFunction(
4413             OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4414  Args);
4415  }
4416 
4417  // 5. Build if (res == 1)
4418  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4419  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4420  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4421  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4422  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4423 
4424  // 6. Build then branch: where we have reduced values in the master
4425  // thread in each team.
4426  // __kmpc_end_reduce{_nowait}(<gtid>);
4427  // break;
4428  CGF.EmitBlock(ThenBB);
4429 
4430  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4431  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4432  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4433  auto IPriv = Privates.begin();
4434  auto ILHS = LHSExprs.begin();
4435  auto IRHS = RHSExprs.begin();
4436  for (const Expr *E : ReductionOps) {
4437  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4438  cast<DeclRefExpr>(*IRHS));
4439  ++IPriv;
4440  ++ILHS;
4441  ++IRHS;
4442  }
4443  };
4444  llvm::Value *EndArgs[] = {ThreadId};
4445  RegionCodeGenTy RCG(CodeGen);
4446  NVPTXActionTy Action(
4447  nullptr, llvm::None,
4448  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4449  EndArgs);
4450  RCG.setAction(Action);
4451  RCG(CGF);
4452  // There is no need to emit line number for unconditional branch.
4453   (void)ApplyDebugLocation::CreateEmpty(CGF);
4454   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4455 }
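// For orientation, a hedged example (not from this file) of OpenMP source
// whose device-side lowering flows through emitReduction above; a form like
// this takes the __kmpc_nvptx_parallel_reduce_nowait_v2 branch:
//
//   double sum = 0.0;
//   #pragma omp target teams distribute parallel for reduction(+ : sum)
//   for (int i = 0; i < N; ++i)
//     sum += a[i];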
4456 
4457 const VarDecl *
4458 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4459                                          const VarDecl *NativeParam) const {
4460  if (!NativeParam->getType()->isReferenceType())
4461  return NativeParam;
4462  QualType ArgType = NativeParam->getType();
4463  QualifierCollector QC;
4464  const Type *NonQualTy = QC.strip(ArgType);
4465  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4466  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4467  if (Attr->getCaptureKind() == OMPC_map) {
4468  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4470  } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4471  PointeeTy.isConstant(CGM.getContext())) {
4472  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4474  }
4475  }
4476  ArgType = CGM.getContext().getPointerType(PointeeTy);
4477  QC.addRestrict();
4478  enum { NVPTX_local_addr = 5 };
4479  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4480  ArgType = QC.apply(CGM.getContext(), ArgType);
4481  if (isa<ImplicitParamDecl>(NativeParam))
4482     return ImplicitParamDecl::Create(
4483         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4484  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4485  return ParmVarDecl::Create(
4486  CGM.getContext(),
4487  const_cast<DeclContext *>(NativeParam->getDeclContext()),
4488  NativeParam->getBeginLoc(), NativeParam->getLocation(),
4489  NativeParam->getIdentifier(), ArgType,
4490  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4491 }
4492 
4493 Address
4494 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4495                                           const VarDecl *NativeParam,
4496  const VarDecl *TargetParam) const {
4497  assert(NativeParam != TargetParam &&
4498  NativeParam->getType()->isReferenceType() &&
4499  "Native arg must not be the same as target arg.");
4500  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4501  QualType NativeParamType = NativeParam->getType();
4502  QualifierCollector QC;
4503  const Type *NonQualTy = QC.strip(NativeParamType);
4504  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4505  unsigned NativePointeeAddrSpace =
4506  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4507  QualType TargetTy = TargetParam->getType();
4508  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4509  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4510  // First cast to generic.
4511   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4512       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4513  /*AddrSpace=*/0));
4514  // Cast from generic to native address space.
4515   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4516       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4517  NativePointeeAddrSpace));
4518  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4519  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4520  NativeParamType);
4521  return NativeParamAddr;
4522 }
4523 
4524 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4525     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4526  ArrayRef<llvm::Value *> Args) const {
4527  SmallVector<llvm::Value *, 4> TargetArgs;
4528  TargetArgs.reserve(Args.size());
4529  auto *FnType = OutlinedFn.getFunctionType();
4530  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4531  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4532  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4533  break;
4534  }
4535  llvm::Type *TargetType = FnType->getParamType(I);
4536  llvm::Value *NativeArg = Args[I];
4537  if (!TargetType->isPointerTy()) {
4538  TargetArgs.emplace_back(NativeArg);
4539  continue;
4540  }
4541     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4542         NativeArg,
4543  NativeArg->getType()->getPointerElementType()->getPointerTo());
4544  TargetArgs.emplace_back(
4545  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4546  }
4547  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4548 }
4549 
4550 /// Emit function which wraps the outline parallel region
4551 /// and controls the arguments which are passed to this function.
4552 /// The wrapper ensures that the outlined function is called
4553 /// with the correct arguments when data is shared.
4554 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4555  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4556  ASTContext &Ctx = CGM.getContext();
4557  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4558 
4559  // Create a function that takes as argument the source thread.
4560  FunctionArgList WrapperArgs;
4561  QualType Int16QTy =
4562  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4563  QualType Int32QTy =
4564  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4565  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4566                                      /*Id=*/nullptr, Int16QTy,
4567                                      ImplicitParamDecl::Other);
4568   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4569                                /*Id=*/nullptr, Int32QTy,
4570                                ImplicitParamDecl::Other);
4571  WrapperArgs.emplace_back(&ParallelLevelArg);
4572  WrapperArgs.emplace_back(&WrapperArg);
4573 
4574  const CGFunctionInfo &CGFI =
4575  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4576 
4577  auto *Fn = llvm::Function::Create(
4578       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4579       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4580  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4581  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4582  Fn->setDoesNotRecurse();
4583 
4584  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4585  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4586  D.getBeginLoc(), D.getBeginLoc());
4587 
4588  const auto *RD = CS.getCapturedRecordDecl();
4589  auto CurField = RD->field_begin();
4590 
4591  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
4592  /*Name=*/".zero.addr");
4593  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4594  // Get the array of arguments.
4595   SmallVector<llvm::Value *, 8> Args;
4596 
4597  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4598  Args.emplace_back(ZeroAddr.getPointer());
4599 
4600  CGBuilderTy &Bld = CGF.Builder;
4601  auto CI = CS.capture_begin();
4602 
4603  // Use global memory for data sharing.
4604  // Handle passing of global args to workers.
4605  Address GlobalArgs =
4606  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4607  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4608  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4609  CGF.EmitRuntimeCall(
4610  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4611  DataSharingArgs);
4612 
4613  // Retrieve the shared variables from the list of references returned
4614  // by the runtime. Pass the variables to the outlined function.
4615  Address SharedArgListAddress = Address::invalid();
4616   if (CS.capture_size() > 0 ||
4617       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4618     SharedArgListAddress = CGF.EmitLoadOfPointer(
4619         GlobalArgs, CGF.getContext()
4620                         .getPointerType(CGF.getContext().getPointerType(
4621                             CGF.getContext().VoidPtrTy))
4622                         .castAs<PointerType>());
4623  }
4624  unsigned Idx = 0;
4625   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4626     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4627  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4628  Src, CGF.SizeTy->getPointerTo());
4629     llvm::Value *LB = CGF.EmitLoadOfScalar(
4630         TypedAddress,
4631         /*Volatile=*/false,
4632         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4633         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4634  Args.emplace_back(LB);
4635  ++Idx;
4636  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4637  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4638  Src, CGF.SizeTy->getPointerTo());
4639     llvm::Value *UB = CGF.EmitLoadOfScalar(
4640         TypedAddress,
4641         /*Volatile=*/false,
4642         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4643         cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4644  Args.emplace_back(UB);
4645  ++Idx;
4646  }
4647  if (CS.capture_size() > 0) {
4648  ASTContext &CGFContext = CGF.getContext();
4649  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4650  QualType ElemTy = CurField->getType();
4651  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4652  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4653  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4654  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4655  /*Volatile=*/false,
4656  CGFContext.getPointerType(ElemTy),
4657  CI->getLocation());
4658  if (CI->capturesVariableByCopy() &&
4659  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4660  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4661  CI->getLocation());
4662  }
4663  Args.emplace_back(Arg);
4664  }
4665  }
4666 
4667  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4668  CGF.FinishFunction();
4669  return Fn;
4670 }
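// Roughly, the emitted wrapper behaves like the following pseudo-C (a sketch
// matching the comment above; all names here are invented):
//
//   void outlined_wrapper(uint16_t parallel_level, uint32_t wrapper_tid) {
//     uint32_t zero = 0;
//     void **shared_args;
//     __kmpc_get_shared_variables(&shared_args);
//     // for loop-bound-sharing directives, shared_args[0]/[1] carry LB/UB
//     outlined(&wrapper_tid, &zero, shared_args[0], ..., shared_args[n-1]);
//   }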
4671 
4672 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4673                                               const Decl *D) {
4674   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4675     return;
4676 
4677  assert(D && "Expected function or captured|block decl.");
4678  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4679  "Function is registered already.");
4680  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4681  "Team is set but not processed.");
4682  const Stmt *Body = nullptr;
4683  bool NeedToDelayGlobalization = false;
4684  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4685  Body = FD->getBody();
4686  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4687  Body = BD->getBody();
4688  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4689  Body = CD->getBody();
4690  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4691  if (NeedToDelayGlobalization &&
4692  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4693  return;
4694  }
4695  if (!Body)
4696  return;
4697  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4698  VarChecker.Visit(Body);
4699  const RecordDecl *GlobalizedVarsRecord =
4700  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4701  TeamAndReductions.first = nullptr;
4702  TeamAndReductions.second.clear();
4703  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4704  VarChecker.getEscapedVariableLengthDecls();
4705  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4706  return;
4707  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4708  I->getSecond().MappedParams =
4709  std::make_unique<CodeGenFunction::OMPMapVars>();
4710  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4711  I->getSecond().EscapedParameters.insert(
4712  VarChecker.getEscapedParameters().begin(),
4713  VarChecker.getEscapedParameters().end());
4714  I->getSecond().EscapedVariableLengthDecls.append(
4715  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4716  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4717  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4718  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4719  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4720  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4721  }
4722  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4723  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4724  VarChecker.Visit(Body);
4725  I->getSecond().SecondaryGlobalRecord =
4726  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4727  I->getSecond().SecondaryLocalVarData.emplace();
4728  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4729  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4730  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4731  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4732  Data.insert(
4733  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4734  }
4735  }
4736  if (!NeedToDelayGlobalization) {
4737  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4738  struct GlobalizationScope final : EHScopeStack::Cleanup {
4739  GlobalizationScope() = default;
4740 
4741  void Emit(CodeGenFunction &CGF, Flags flags) override {
4742  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4743  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4744  }
4745  };
4746  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4747  }
4748 }
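// A hedged illustration of what triggers globalization here (not from this
// file): a team-local variable that escapes into a nested parallel region
// must live in the globalized record rather than in a thread-private slot.
//
//   #pragma omp target teams
//   {
//     int x = 0;                 // escapes: shared with the workers below
//     #pragma omp parallel
//     x = omp_get_thread_num();
//   }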
4749 
4750 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4751                                                         const VarDecl *VD) {
4752  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4753  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4754  switch (A->getAllocatorType()) {
4755     // Use the default allocator here, as by default local vars are
4756     // thread-local.
4757  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4758  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4759  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4760  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4761  // Follow the user decision - use default allocation.
4762  return Address::invalid();
4763  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4764       // TODO: implement support for user-defined allocators.
4765  return Address::invalid();
4766  case OMPAllocateDeclAttr::OMPConstMemAlloc: {
4767  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4768  auto *GV = new llvm::GlobalVariable(
4769  CGM.getModule(), VarTy, /*isConstant=*/false,
4770           llvm::GlobalValue::InternalLinkage,
4771           llvm::Constant::getNullValue(VarTy), VD->getName(),
4772  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4773           CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
4774       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4775  GV->setAlignment(Align.getAsAlign());
4776  return Address(GV, Align);
4777  }
4778  case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
4779  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4780  auto *GV = new llvm::GlobalVariable(
4781  CGM.getModule(), VarTy, /*isConstant=*/false,
4782           llvm::GlobalValue::InternalLinkage,
4783           llvm::Constant::getNullValue(VarTy), VD->getName(),
4784  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4785           CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
4786       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4787  GV->setAlignment(Align.getAsAlign());
4788  return Address(GV, Align);
4789  }
4790  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4791  case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
4792  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4793  auto *GV = new llvm::GlobalVariable(
4794  CGM.getModule(), VarTy, /*isConstant=*/false,
4795           llvm::GlobalValue::InternalLinkage,
4796           llvm::Constant::getNullValue(VarTy), VD->getName());
4797  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4798  GV->setAlignment(Align.getAsAlign());
4799  return Address(GV, Align);
4800  }
4801  }
4802  }
4803 
4804   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4805     return Address::invalid();
4806 
4807  VD = VD->getCanonicalDecl();
4808  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4809  if (I == FunctionGlobalizedDecls.end())
4810  return Address::invalid();
4811  auto VDI = I->getSecond().LocalVarData.find(VD);
4812  if (VDI != I->getSecond().LocalVarData.end())
4813  return VDI->second.PrivateAddr;
4814  if (VD->hasAttrs()) {
4815     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4816          E(VD->attr_end());
4817  IT != E; ++IT) {
4818  auto VDI = I->getSecond().LocalVarData.find(
4819  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4820  ->getCanonicalDecl());
4821  if (VDI != I->getSecond().LocalVarData.end())
4822  return VDI->second.PrivateAddr;
4823  }
4824  }
4825 
4826  return Address::invalid();
4827 }
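// Example (illustrative) of a declaration that takes the OMPPTeamMemAlloc
// path above and is placed in CUDA shared memory:
//
//   int scratch;
//   #pragma omp allocate(scratch) allocator(omp_pteam_mem_alloc)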
4828 
4829 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4830   FunctionGlobalizedDecls.erase(CGF.CurFn);
4831   CGOpenMPRuntime::functionFinished(CGF);
4832 }
4833 
4834 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4835     CodeGenFunction &CGF, const OMPLoopDirective &S,
4836  OpenMPDistScheduleClauseKind &ScheduleKind,
4837  llvm::Value *&Chunk) const {
4838  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4839  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4840  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4841         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4842         S.getIterationVariable()->getType(), S.getBeginLoc());
4843     return;
4844  }
4845   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4846       CGF, S, ScheduleKind, Chunk);
4847 }
4848 
4849 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4850     CodeGenFunction &CGF, const OMPLoopDirective &S,
4851  OpenMPScheduleClauseKind &ScheduleKind,
4852  const Expr *&ChunkExpr) const {
4853  ScheduleKind = OMPC_SCHEDULE_static;
4854  // Chunk size is 1 in this case.
4855  llvm::APInt ChunkSize(32, 1);
4856  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4857  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4858  SourceLocation());
4859 }
4860 
4861 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4862     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4863   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4864          " Expected target-based directive.");
4865  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4866  for (const CapturedStmt::Capture &C : CS->captures()) {
4867  // Capture variables captured by reference in lambdas for target-based
4868  // directives.
4869  if (!C.capturesVariable())
4870  continue;
4871  const VarDecl *VD = C.getCapturedVar();
4872  const auto *RD = VD->getType()
4873  .getCanonicalType()
4874                          .getNonReferenceType()
4875                          ->getAsCXXRecordDecl();
4876  if (!RD || !RD->isLambda())
4877  continue;
4878  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4879  LValue VDLVal;
4880  if (VD->getType().getCanonicalType()->isReferenceType())
4881  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4882  else
4883  VDLVal = CGF.MakeAddrLValue(
4884  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4885  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4886  FieldDecl *ThisCapture = nullptr;
4887  RD->getCaptureFields(Captures, ThisCapture);
4888  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4889  LValue ThisLVal =
4890  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4891  llvm::Value *CXXThis = CGF.LoadCXXThis();
4892  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4893  }
4894  for (const LambdaCapture &LC : RD->captures()) {
4895  if (LC.getCaptureKind() != LCK_ByRef)
4896  continue;
4897  const VarDecl *VD = LC.getCapturedVar();
4898  if (!CS->capturesVariable(VD))
4899  continue;
4900  auto It = Captures.find(VD);
4901  assert(It != Captures.end() && "Found lambda capture without field.");
4902  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4903  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4904  if (VD->getType().getCanonicalType()->isReferenceType())
4905  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4906  VD->getType().getCanonicalType())
4907  .getAddress(CGF);
4908  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4909  }
4910  }
4911 }
4912 
4913 unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
4914   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4915 }
4916 
4917 bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4918                                                             LangAS &AS) {
4919  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4920  return false;
4921  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4922   switch (A->getAllocatorType()) {
4923  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4924  // Not supported, fallback to the default mem space.
4925  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4926  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4927  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4928  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4929  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4930  AS = LangAS::Default;
4931  return true;
4932  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4933  AS = LangAS::cuda_constant;
4934  return true;
4935  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4936  AS = LangAS::cuda_shared;
4937  return true;
4938  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4939  llvm_unreachable("Expected predefined allocator for the variables with the "
4940  "static storage.");
4941  }
4942  return false;
4943 }
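// E.g. (illustrative): a global declared with
//   #pragma omp allocate(gv) allocator(omp_const_mem_alloc)
// is reported by this hook as living in LangAS::cuda_constant.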
4944 
4945 // Get current CudaArch and ignore any unknown values.
4946 static CudaArch getCudaArch(CodeGenModule &CGM) {
4947   if (!CGM.getTarget().hasFeature("ptx"))
4948  return CudaArch::UNKNOWN;
4949  llvm::StringMap<bool> Features;
4950  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4951  CGM.getTarget().getTargetOpts().CPU,
4952  CGM.getTarget().getTargetOpts().Features);
4953  for (const auto &Feature : Features) {
4954  if (Feature.getValue()) {
4955  CudaArch Arch = StringToCudaArch(Feature.getKey());
4956  if (Arch != CudaArch::UNKNOWN)
4957  return Arch;
4958  }
4959  }
4960  return CudaArch::UNKNOWN;
4961 }
4962 
4963 /// Check to see if target architecture supports unified addressing which is
4964 /// a restriction for OpenMP requires clause "unified_shared_memory".
4965 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4966     const OMPRequiresDecl *D) {
4967  for (const OMPClause *Clause : D->clauselists()) {
4968  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4969  CudaArch Arch = getCudaArch(CGM);
4970  switch (Arch) {
4971  case CudaArch::SM_20:
4972  case CudaArch::SM_21:
4973  case CudaArch::SM_30:
4974  case CudaArch::SM_32:
4975  case CudaArch::SM_35:
4976  case CudaArch::SM_37:
4977  case CudaArch::SM_50:
4978  case CudaArch::SM_52:
4979  case CudaArch::SM_53:
4980  case CudaArch::SM_60:
4981  case CudaArch::SM_61:
4982  case CudaArch::SM_62: {
4983  SmallString<256> Buffer;
4984  llvm::raw_svector_ostream Out(Buffer);
4985  Out << "Target architecture " << CudaArchToString(Arch)
4986  << " does not support unified addressing";
4987  CGM.Error(Clause->getBeginLoc(), Out.str());
4988  return;
4989  }
4990  case CudaArch::SM_70:
4991  case CudaArch::SM_72:
4992  case CudaArch::SM_75:
4993  case CudaArch::GFX600:
4994  case CudaArch::GFX601:
4995  case CudaArch::GFX700:
4996  case CudaArch::GFX701:
4997  case CudaArch::GFX702:
4998  case CudaArch::GFX703:
4999  case CudaArch::GFX704:
5000  case CudaArch::GFX801:
5001  case CudaArch::GFX802:
5002  case CudaArch::GFX803:
5003  case CudaArch::GFX810:
5004  case CudaArch::GFX900:
5005  case CudaArch::GFX902:
5006  case CudaArch::GFX904:
5007  case CudaArch::GFX906:
5008  case CudaArch::GFX908:
5009  case CudaArch::GFX909:
5010  case CudaArch::GFX1010:
5011  case CudaArch::GFX1011:
5012  case CudaArch::GFX1012:
5013  case CudaArch::UNKNOWN:
5014  break;
5015  case CudaArch::LAST:
5016  llvm_unreachable("Unexpected Cuda arch.");
5017  }
5018  }
5019  }
5020   CGOpenMPRuntime::checkArchForUnifiedAddressing(D);
5021 }
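// For instance (a hedged illustration), compiling the following for sm_35
// hits the diagnostic above, while sm_70 and newer are accepted:
//
//   #pragma omp requires unified_shared_memory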
5022 
5023 /// Get number of SMs and number of blocks per SM.
5024 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
5025  std::pair<unsigned, unsigned> Data;
5026  if (CGM.getLangOpts().OpenMPCUDANumSMs)
5027  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
5028  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
5029  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
5030  if (Data.first && Data.second)
5031  return Data;
5032  switch (getCudaArch(CGM)) {
5033  case CudaArch::SM_20:
5034  case CudaArch::SM_21:
5035  case CudaArch::SM_30:
5036  case CudaArch::SM_32:
5037  case CudaArch::SM_35:
5038  case CudaArch::SM_37:
5039  case CudaArch::SM_50:
5040  case CudaArch::SM_52:
5041  case CudaArch::SM_53:
5042  return {16, 16};
5043  case CudaArch::SM_60:
5044  case CudaArch::SM_61:
5045  case CudaArch::SM_62:
5046  return {56, 32};
5047  case CudaArch::SM_70:
5048  case CudaArch::SM_72:
5049  case CudaArch::SM_75:
5050  return {84, 32};
5051  case CudaArch::GFX600:
5052  case CudaArch::GFX601:
5053  case CudaArch::GFX700:
5054  case CudaArch::GFX701:
5055  case CudaArch::GFX702:
5056  case CudaArch::GFX703:
5057  case CudaArch::GFX704:
5058  case CudaArch::GFX801:
5059  case CudaArch::GFX802:
5060  case CudaArch::GFX803:
5061  case CudaArch::GFX810:
5062  case CudaArch::GFX900:
5063  case CudaArch::GFX902:
5064  case CudaArch::GFX904:
5065  case CudaArch::GFX906:
5066  case CudaArch::GFX908:
5067  case CudaArch::GFX909:
5068  case CudaArch::GFX1010:
5069  case CudaArch::GFX1011:
5070  case CudaArch::GFX1012:
5071  case CudaArch::UNKNOWN:
5072  break;
5073  case CudaArch::LAST:
5074  llvm_unreachable("Unexpected Cuda arch.");
5075  }
5076  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5077 }
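// Worked example: for SM_70 this returns {84, 32}, so clear() below shapes
// the static globalization buffer as an 84 x 32 array of record slots, one
// per (SM, resident block) pair.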
5078 
5079 void CGOpenMPRuntimeNVPTX::clear() {
5080   if (!GlobalizedRecords.empty()) {
5081  ASTContext &C = CGM.getContext();
5082     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
5083     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
5084     RecordDecl *StaticRD = C.buildImplicitRecord(
5085  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5086  StaticRD->startDefinition();
5087  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5088  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5089  SharedStaticRD->startDefinition();
5090  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5091  if (Records.Records.empty())
5092  continue;
5093  unsigned Size = 0;
5094  unsigned RecAlignment = 0;
5095  for (const RecordDecl *RD : Records.Records) {
5096  QualType RDTy = C.getRecordType(RD);
5097  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5098  RecAlignment = std::max(RecAlignment, Alignment);
5099  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5100  Size =
5101  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5102  }
5103  Size = llvm::alignTo(Size, RecAlignment);
5104  llvm::APInt ArySize(/*numBits=*/64, Size);
5105  QualType SubTy = C.getConstantArrayType(
5106  C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
5107  const bool UseSharedMemory = Size <= SharedMemorySize;
5108  auto *Field =
5109  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5110  SourceLocation(), SourceLocation(), nullptr, SubTy,
5111                             C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5112                             /*BW=*/nullptr, /*Mutable=*/false,
5113  /*InitStyle=*/ICIS_NoInit);
5114  Field->setAccess(AS_public);
5115  if (UseSharedMemory) {
5116  SharedStaticRD->addDecl(Field);
5117  SharedRecs.push_back(&Records);
5118  } else {
5119  StaticRD->addDecl(Field);
5120  GlobalRecs.push_back(&Records);
5121  }
5122  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5123  Records.UseSharedMemory->setInitializer(
5124  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5125  }
5126  // Allocate SharedMemorySize buffer for the shared memory.
5127     // FIXME: nvlink does not handle weak linkage correctly (objects with
5128     // different sizes are reported as erroneous).
5129     // Restore this code as soon as nvlink is fixed.
5130  if (!SharedStaticRD->field_empty()) {
5131  llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5132  QualType SubTy = C.getConstantArrayType(
5133  C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
5134  auto *Field = FieldDecl::Create(
5135  C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5136           C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5137           /*BW=*/nullptr, /*Mutable=*/false,
5138  /*InitStyle=*/ICIS_NoInit);
5139  Field->setAccess(AS_public);
5140  SharedStaticRD->addDecl(Field);
5141  }
5142  SharedStaticRD->completeDefinition();
5143  if (!SharedStaticRD->field_empty()) {
5144  QualType StaticTy = C.getRecordType(SharedStaticRD);
5145  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5146  auto *GV = new llvm::GlobalVariable(
5147  CGM.getModule(), LLVMStaticTy,
5148  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5149  llvm::Constant::getNullValue(LLVMStaticTy),
5150  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
5151  llvm::GlobalValue::NotThreadLocal,
5152           C.getTargetAddressSpace(LangAS::cuda_shared));
5153       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5154  GV, CGM.VoidPtrTy);
5155  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5156  Rec->Buffer->replaceAllUsesWith(Replacement);
5157  Rec->Buffer->eraseFromParent();
5158  }
5159  }
5160  StaticRD->completeDefinition();
5161  if (!StaticRD->field_empty()) {
5162  QualType StaticTy = C.getRecordType(StaticRD);
5163  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5164  llvm::APInt Size1(32, SMsBlockPerSM.second);
5165  QualType Arr1Ty =
5166  C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
5167  /*IndexTypeQuals=*/0);
5168  llvm::APInt Size2(32, SMsBlockPerSM.first);
5169  QualType Arr2Ty =
5170  C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
5171  /*IndexTypeQuals=*/0);
5172  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5173       // FIXME: nvlink does not handle weak linkage correctly (objects with
5174       // different sizes are reported as erroneous).
5175       // Restore CommonLinkage as soon as nvlink is fixed.
5176  auto *GV = new llvm::GlobalVariable(
5177  CGM.getModule(), LLVMArr2Ty,
5178  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5179  llvm::Constant::getNullValue(LLVMArr2Ty),
5180  "_openmp_static_glob_rd_$_");
5181  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5182  GV, CGM.VoidPtrTy);
5183  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5184  Rec->Buffer->replaceAllUsesWith(Replacement);
5185  Rec->Buffer->eraseFromParent();
5186  }
5187  }
5188  }
5189  if (!TeamsReductions.empty()) {
5190  ASTContext &C = CGM.getContext();
5191  RecordDecl *StaticRD = C.buildImplicitRecord(
5192  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
5193  StaticRD->startDefinition();
5194  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5195  QualType RecTy = C.getRecordType(TeamReductionRec);
5196  auto *Field = FieldDecl::Create(
5197  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5198           C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
5199           /*BW=*/nullptr, /*Mutable=*/false,
5200  /*InitStyle=*/ICIS_NoInit);
5201  Field->setAccess(AS_public);
5202  StaticRD->addDecl(Field);
5203  }
5204  StaticRD->completeDefinition();
5205  QualType StaticTy = C.getRecordType(StaticRD);
5206  llvm::Type *LLVMReductionsBufferTy =
5207  CGM.getTypes().ConvertTypeForMem(StaticTy);
5208     // FIXME: nvlink does not handle weak linkage correctly (objects with
5209     // different sizes are reported as erroneous).
5210     // Restore CommonLinkage as soon as nvlink is fixed.
5211  auto *GV = new llvm::GlobalVariable(
5212  CGM.getModule(), LLVMReductionsBufferTy,
5213  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5214  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5215  "_openmp_teams_reductions_buffer_$_");
5216  KernelTeamsReductionPtr->setInitializer(
5217  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5218  CGM.VoidPtrTy));
5219  }
5220   CGOpenMPRuntime::clear();
5221 }
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:653
RecordDecl * buildImplicitRecord(StringRef Name, RecordDecl::TagKind TK=TTK_Struct) const
Create a new implicit TU-level CXXRecordDecl or RecordDecl declaration.
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:216
const BlockDecl * getBlockDecl() const
Definition: Expr.h:5593
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:219
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl *> &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static const Decl * getCanonicalDecl(const Decl *D)
llvm::IntegerType * IntTy
int
int64_t QuantityType
Definition: CharUnits.h:40
LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T)
Given a value of type T* that may not be to a complete object, construct an l-value with the natural ...
Other implicit parameter.
Definition: Decl.h:1555
A class which contains all the information about a particular captured value.
Definition: Decl.h:4043
if(T->getSizeExpr()) TRY_TO(TraverseStmt(T -> getSizeExpr()))
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2614
CanQualType VoidPtrTy
Definition: ASTContext.h:1044
A (possibly-)qualified type.
Definition: Type.h:654
llvm::Function * emitReductionFunction(SourceLocation Loc, llvm::Type *ArgsType, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps)
Emits reduction function.
CudaArch
Definition: Cuda.h:36
ArrayRef< OMPClause * > clauses()
Definition: StmtOpenMP.h:325
llvm::Type * ConvertTypeForMem(QualType T)
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:48
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Definition: CGExpr.cpp:139
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:193
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::LLVMContext & getLLVMContext()
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
static std::pair< unsigned, unsigned > getSMsBlocksPerSM(CodeGenModule &CGM)
Get number of SMs and number of blocks per SM.
attr_iterator attr_begin() const
Definition: DeclBase.h:505
Stmt - This represents one statement.
Definition: Stmt.h:66
void checkArchForUnifiedAddressing(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl *> &Vars)
Get list of reduction variables from the teams ... directives.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:88
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:34
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:225
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:421
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter...
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
llvm::Value * ScratchpadIndex
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:283
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6452
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of &#39;this&#39;.
The base class of the type hierarchy.
Definition: Type.h:1450
virtual void clear()
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:4434
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:116
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Definition: CGExpr.cpp:2345
void emitIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen)
Emits code for OpenMP &#39;if&#39; clause using specified CodeGen function.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference...
Definition: CGExpr.cpp:4212
static bool hasStaticScheduling(const OMPExecutableDirective &D)
Check if the directive is loops based and has schedule clause at all or has static scheduling...
virtual void checkArchForUnifiedAddressing(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing...
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::FunctionCallee createNVPTXRuntimeFunction(unsigned Function)
Returns specified OpenMP runtime function for the current OpenMP implementation.
QualType getElementType() const
Definition: Type.h:2910
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1322
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:128
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
Represents a variable declaration or definition.
Definition: Decl.h:820
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:71
The "union" keyword.
Definition: Type.h:5195
const ArrayType * castAsArrayTypeUnsafe() const
A variant of castAs<> for array type which silently discards qualifiers from the outermost type...
Definition: Type.h:7076
LangAS
Defines the address space values used by the address space qualifier of QualType. ...
Definition: AddressSpaces.h:25
bool field_empty() const
Definition: Decl.h:3971
DiagnosticsEngine & getDiags() const
llvm::Value * getPointer() const
Definition: Address.h:37
llvm::Type * ConvertTypeForMem(QualType T)
ConvertTypeForMem - Convert type T into a llvm::Type.
bool tryEmitDeclareVariant(const GlobalDecl &NewGD, const GlobalDecl &OldGD, llvm::GlobalValue *OrigAddr, bool IsForDefinition) override
Tries to emit declare variant function for OldGD from NewGD.
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:56
SPMD execution mode (all threads are worker threads).
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:244
Represents a struct/union/class.
Definition: Decl.h:3748
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
clauselist_range clauselists()
Definition: DeclOpenMP.h:390
virtual llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:168
attr_iterator attr_end() const
Definition: DeclBase.h:508
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Represents a member of a struct/union/class.
Definition: Decl.h:2729
This represents clause &#39;lastprivate&#39; in the &#39;#pragma omp ...&#39; directives.
CharUnits getAlignment() const
Definition: CGValue.h:316
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive...
Definition: StmtOpenMP.h:266
unsigned getDefaultLocationReserved2Flags() const override
Returns additional flags that can be stored in reserved_2 field of the default location.
static llvm::Value * getMasterThreadID(CodeGenFunction &CGF)
Get the thread id of the OMP master thread.
void setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint=false)
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4154
bool isReferenceType() const
Definition: Type.h:6516
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl *> &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:301
__DEVICE__ int max(int __a, int __b)
Expr * getSubExpr()
Definition: Expr.h:3202
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:723
static bool hasNestedLightweightDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) lightweight runtime construct, if any.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
void InitTempAlloca(Address Alloca, llvm::Value *Value)
InitTempAlloca - Provide an initial value for the given alloca which will be observable at all locati...
Definition: CGExpr.cpp:126
This is a common base class for loop directives (&#39;omp simd&#39;, &#39;omp for&#39;, &#39;omp for simd&#39; etc...
Definition: StmtOpenMP.h:420
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
OpenMPDistScheduleClauseKind
OpenMP attributes for &#39;dist_schedule&#39; clause.
Definition: OpenMPKinds.h:151
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:156
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
void emitOpenMPDeviceFunctionRedefinition(GlobalDecl OldGD, GlobalDecl NewGD, llvm::GlobalValue *GV)
Emits the definition of OldGD function with body from NewGD.
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:66
child_range children()
Definition: Stmt.cpp:224
Expr * getIterationVariable() const
Definition: StmtOpenMP.h:914
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
ModeFlagsTy
Enum for accesseing the reserved_2 field of the ident_t struct.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Scope - A scope is a transient data structure that is used while parsing the program.
Definition: Scope.h:40
static CGOpenMPRuntimeNVPTX::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
This represents clause &#39;reduction&#39; in the &#39;#pragma omp ...&#39; directives.
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
bool isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a worksharing directive.
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1818
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value &#39;V&#39; and type &#39;type&#39;.
Definition: Expr.cpp:928
unsigned getDefaultFirstprivateAddressSpace() const override
Returns default address space for the constant firstprivates, constant address space by default...
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda&#39;s captures is an init-capture.
Definition: ExprCXX.cpp:1240
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:62
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:877
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
llvm::FunctionCallee CreateConvergentRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false)
Create or return a runtime function declaration with the specified type and name. ...
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:4055
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:4685
Unknown execution mode (orphaned directive).
bool hasAttr() const
Definition: DeclBase.h:542
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition: Type.cpp:1690
Describes the capture of either a variable, or &#39;this&#39;, or variable-length array type.
Definition: Stmt.h:3389
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like &#39;private&#39;, &#39;firstprivate&#39;, &#39;reduction&#39; etc.
void setAddress(Address address)
Definition: CGValue.h:330
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl *> &Vars)
Get list of lastprivate variables from the teams distribute ...
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
TypeSourceInfo * getTrivialTypeSourceInfo(QualType T, SourceLocation Loc=SourceLocation()) const
Allocate a TypeSourceInfo where all locations have been initialized to a given location, which defaults to the empty location.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:119
This represents &#39;#pragma omp requires...&#39; directive.
Definition: DeclOpenMP.h:345
unsigned Offset
Definition: Format.cpp:1827
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:253
virtual bool initFeatureMap(llvm::StringMap< bool > &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector< std::string > &FeatureVec) const
Initialize the map with the default set of target features for the CPU this should include all legal ...
Definition: TargetInfo.cpp:402
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:619
This represents one expression.
Definition: Expr.h:108
static Address invalid()
Definition: Address.h:34
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:327
Enters a new scope for capturing cleanups, all of which will be executed once the scope is exited...
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:134
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
const CGFunctionInfo & arrangeNullaryFunction()
A nullary function is a freestanding function of type 'void ()'.
Definition: CGCall.cpp:695
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:5579
Expr * getCallee()
Definition: Expr.h:2663
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements, of a variable length array type, plus the largest non-variably-sized element type.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:43
CharUnits getTypeAlignInChars(QualType T) const
Return the ABI-specified alignment of a (complete) type T, in characters.
DeclContext * getDeclContext()
Definition: DeclBase.h:438
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause *> Clauses)
Definition: StmtOpenMP.h:187
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
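The two CharUnits entries above are easiest to read side by side; a small illustration (Ctx is an assumed ASTContext &, and toBits is the conversion listed further down this page):

clang::CharUnits Align = clang::CharUnits::fromQuantity(8); // 8 character units
int64_t Raw = Align.getQuantity();   // back to the raw integer, 8
int64_t Bits = Ctx.toBits(Align);    // 64 on targets where a char is 8 bits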
CanQualType ShortTy
Definition: ASTContext.h:1025
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
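The helper described above encodes the classic warp-level tree reduction. A minimal sketch of the same idea in CUDA C++, for a single int rather than the opaque reduce-list the generated helper actually handles (warpReduceSum is a hypothetical name, not part of this file):

__device__ int warpReduceSum(int Val) {
  // Combine with the lane 16, 8, 4, 2, 1 positions away; after
  // log2(32) steps lane 0 holds the sum for the entire warp.
  for (int Offset = 16; Offset > 0; Offset >>= 1)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val;
}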
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars, const Expr *IfCond) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
This represents 'ordered' clause in the '#pragma omp ...' directive.
QualType getType() const
Definition: Expr.h:137
LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T)
QualType getRecordType(const RecordDecl *Decl) const
UnaryOperator - This represents the unary-expression's (except sizeof and alignof), the postinc/postdec operators from postfix-expression, and various extensions.
Definition: Expr.h:2046
MachineConfiguration
GPU Configuration: This information can be derived from CUDA registers, however, providing compile ti...
llvm::Value * EmitCastToVoidPtr(llvm::Value *value)
Emit a cast to void* in the appropriate address space.
Definition: CGExpr.cpp:50
const TargetInfo & getTarget() const
ValueDecl * getDecl()
Definition: Expr.h:1247
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
const LangOptions & getLangOpts() const
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
ASTContext & getContext() const
Non-SPMD execution mode (1 master thread, others are workers).
llvm::Value * ScratchpadWidth
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2088
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:40
bool hasClausesOfKind() const
Returns true if the current directive has one or more clauses of a specific kind.
Definition: StmtOpenMP.h:219
AttrVec & getAttrs()
Definition: DeclBase.h:490
bool hasAttrs() const
Definition: DeclBase.h:484
std::string CPU
If given, the name of the target CPU to generate code for.
Definition: TargetOptions.h:36
The l-value was considered opaque, so the alignment was determined from a type.
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:232
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:141
Kind
This captures a statement into a function.
Definition: Stmt.h:3376
QualType getCanonicalType() const
Definition: Type.h:6295
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barrier is going to ...
Encodes a location in the source.
static llvm::Value * getThreadLimit(CodeGenFunction &CGF, bool IsInSPMDExecutionMode=false)
Get the value of the thread_limit clause in the teams directive.
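A plausible shape for that computation, assuming generic (non-SPMD) kernels reserve one warp for the master thread so it is excluded from the team's thread_limit (getNVPTXNumThreads and getNVPTXWarpSize are the helpers listed elsewhere on this page):

llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
return IsInSPMDExecutionMode
           ? NumThreads // SPMD: every thread is a worker
           : CGF.Builder.CreateNUWSub(NumThreads, getNVPTXWarpSize(CGF),
                                      "thread_limit"); // reserve the master warp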
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Expr * getSubExpr() const
Definition: Expr.h:2076
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2166
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:33
CastKind getCastKind() const
Definition: Expr.h:3196
This represents 'schedule' clause in the '#pragma omp ...' directive.
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:323
const Decl * getDecl() const
Definition: GlobalDecl.h:77
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1225
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:198
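Both constant-GEP helpers appear throughout the reduction code paths; an illustrative pairing in which ReduceList (an LLVM array alloca) and BufferPtr are hypothetical values already in scope:

// Element 1 of an [n x T] alloca: GEP with indices (0, 1).
Address Elem = CGF.Builder.CreateConstArrayGEP(ReduceList, /*Index=*/1);
// One element past a plain T* pointer: GEP with index (1).
Address Slot = CGF.Builder.CreateConstGEP(BufferPtr, /*Index=*/1);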
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl *> &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
This file defines OpenMP nodes for declarative directives.
std::vector< std::string > Features
The list of target specific features to enable or disable – this should be a list of strings startin...
Definition: TargetOptions.h:55
llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams directive.
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:51
CanQualType VoidTy
Definition: ASTContext.h:1016
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl *> &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
arg_range arguments()
Definition: Expr.h:2739
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
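A hedged sketch of what such a property amounts to: a small device-visible global the runtime can inspect, where the "_exec_mode" suffix and the 0/1 encoding are assumptions rather than confirmed details:

auto *GV = new llvm::GlobalVariable(
    CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
    llvm::GlobalValue::WeakAnyLinkage,
    llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
    llvm::Twine(Name, "_exec_mode"));
CGM.addCompilerUsedGlobal(GV); // keep it alive for the runtime to read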
An aligned address.
Definition: Address.h:24
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3274
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3477
OpenMPRTLFunctionNVPTX
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language...
Definition: Expr.h:258
QualType getType() const
Definition: CGValue.h:264
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
FunctionArgList - Type for representing both the decl and type of parameters to a function...
Definition: CGCall.h:355
CanQualType CharTy
Definition: ASTContext.h:1018
static CudaArch getCudaArch(CodeGenModule &CGM)
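Several entries on this page (initFeatureMap, hasFeature, TargetOptions::CPU and ::Features) only meet in this helper; a sketch of how they plausibly combine to recover the CUDA arch, using StringToCudaArch from clang/Basic/Cuda.h:

if (!CGM.getTarget().hasFeature("ptx"))
  return CudaArch::UNKNOWN;
llvm::StringMap<bool> Features;
CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
                               CGM.getTarget().getTargetOpts().CPU,
                               CGM.getTarget().getTargetOpts().Features);
for (const auto &Feature : Features)
  if (Feature.getValue() && Feature.getKey().startswith("sm_"))
    return StringToCudaArch(Feature.getKey()); // e.g. "sm_70" -> SM_70
return CudaArch::UNKNOWN;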
void setAction(PrePostActionTy &Action) const
CGFunctionInfo - Class to encapsulate the information about a function definition.
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2672
Dataflow Directional Tag Classes.
Class that provides a way to call the simple version of codegen for an OpenMP region, or an advanced one with possibl...
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition: CGExpr.cpp:2336
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:6196
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1271
ArrayRef< Capture > captures() const
Definition: Decl.h:4164
A basic class for pre- and post-actions in an advanced codegen sequence for an OpenMP region.
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition: CGBuilder.h:69
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr *> Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values, in the specified direction.
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:6203
llvm::FunctionCallee CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false, bool AssumeConvergent=false)
Create or return a runtime function declaration with the specified type and name. ...
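This is the entry point through which the device runtime shims are materialized; a minimal sketch with a hypothetical entry name (CGM and CGF assumed in scope), paired with EmitNounwindRuntimeCall from further down the page:

llvm::FunctionType *FnTy =
    llvm::FunctionType::get(CGM.VoidTy, /*isVarArg=*/false);
llvm::FunctionCallee RTFn =
    CGM.CreateRuntimeFunction(FnTy, "__kmpc_example_entry"); // hypothetical name
CGF.EmitNounwindRuntimeCall(RTFn);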
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:107
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1421
llvm::Module & getModule() const
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3500
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1143
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:2995
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits the address of the word in memory where the current thread ID is stored.
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
This file defines OpenMP AST classes for executable directives and clauses.
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:6811
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
void addRestrict()
Definition: Type.h:273
T * getAttr() const
Definition: DeclBase.h:538
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:51
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:85
Opcode getOpcode() const
Definition: Expr.h:2071
decl_range decls()
Definition: Stmt.h:1273
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
QualType getConstantArrayType(QualType EltTy, const llvm::APInt &ArySize, const Expr *SizeExpr, ArrayType::ArraySizeModifier ASM, unsigned IndexTypeQuals) const
Return the unique reference to the type for a constant array of the specified element type.
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:31
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:473
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1524
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:250
ExecutionMode
Defines the execution mode.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
bool isLValueReferenceType() const
Definition: Type.h:6520
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false)
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1298
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:1958
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Definition: CGStmt.cpp:493
Privates[]
Gets the list of initial values for linear variables.
Definition: OpenMPClause.h:150
Capturing by reference.
Definition: Lambda.h:37
virtual llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression...
Definition: CGExpr.cpp:1246
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
void addAttr(Attr *A)
Definition: DeclBase.cpp:832
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1253
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2546
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:250
static llvm::Value * getNVPTXThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
CGCapturedStmtInfo * CapturedStmtInfo
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2469
static llvm::Value * getNVPTXWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
CanQualType IntTy
Definition: ASTContext.h:1025
llvm::Value * RemoteLaneOffset
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:1902
capture_range captures()
Definition: Stmt.h:3511
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1171
void addAddressSpace(LangAS space)
Definition: Type.h:385
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:786
QualType getType() const
Definition: Decl.h:630
bool isOpenMPLoopDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a directive with an associated loop construct.
LValue - This represents an lvalue references.
Definition: CGValue.h:167
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:146
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
void setAccess(AccessSpecifier AS)
Definition: DeclBase.h:468
CanQualType BoolTy
Definition: ASTContext.h:1017
bool isConstant(const ASTContext &Ctx) const
Definition: Type.h:789
unsigned getTargetAddressSpace(QualType T) const
Definition: ASTContext.h:2557
llvm::CallInst * EmitNounwindRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4006
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:163
const LangOptions & getLangOpts() const
Definition: ASTContext.h:724
static llvm::Value * getNVPTXNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
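The getNVPTX* helpers on this page share one pattern: read a PTX special register through the matching NVVM intrinsic. A sketch of the block-size variant; swapping in nvvm_read_ptx_sreg_tid_x or nvvm_read_ptx_sreg_warpsize gives the thread-id and warp-size queries:

// Reads %ntid.x, the number of threads per block.
return CGF.EmitRuntimeCall(
    llvm::Intrinsic::getDeclaration(
        &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
    "nvptx_num_threads");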
No in-class initializer.
Definition: Specifiers.h:259
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc)
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Attr - This represents one attribute.
Definition: Attr.h:45
SourceLocation getLocation() const
Definition: DeclBase.h:429
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth, signed/unsigned.
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point...
Definition: Expr.cpp:2991
static OMPLinearClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef< Expr *> VL, ArrayRef< Expr *> PL, ArrayRef< Expr *> IL, Expr *Step, Expr *CalcStep, Stmt *PreInit, Expr *PostUpdate)
Creates clause with a list of variables VL and a linear step Step.
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1556
static bool supportsLightweightRuntime(ASTContext &Ctx, const OMPExecutableDirective &D)
Checks if the construct supports lightweight runtime.