clang
8.0.0
|
#include "/work/www-prereleases/8.0.0/rc1/builddocs/llvm.src/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h"
Public Types | |
enum | ExecutionMode { EM_SPMD, EM_NonSPMD, EM_Unknown } |
Defines the execution mode. More... | |
enum | DataSharingMode { CUDA, Generic } |
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are actually global threadlocal, and Generic, in which the local variables are placed in global memory if they may escape their declaration context. More... | |
Public Member Functions | |
CGOpenMPRuntimeNVPTX (CodeGenModule &CGM) | |
void | clear () override |
virtual void | emitProcBindClause (CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc) override |
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generate code for 'proc_bind' clause. More... | |
virtual void | emitNumThreadsClause (CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override |
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads) to generate code for 'num_threads' clause. More... | |
void | emitNumTeamsClause (CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override |
This function ought to emit, in the general case, a call to. More... | |
llvm::Value * | emitParallelOutlinedFunction (const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override |
Emits inlined function for the specified OpenMP parallel. More... | |
llvm::Value * | emitTeamsOutlinedFunction (const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override |
Emits inlined function for the specified OpenMP teams. More... | |
void | emitTeamsCall (CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars) override |
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stored in CapturedStruct. More... | |
void | emitParallelCall (CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars, const Expr *IfCond) override |
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which address is stored in CapturedStruct. More... | |
void | emitBarrierCall (CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override |
Emit an implicit/explicit barrier for OpenMP threads. More... | |
void | emitCriticalRegion (CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override |
Emits a critical region. More... | |
virtual void | emitReduction (CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options) override |
Emit a code for reduction clause. More... | |
llvm::Constant * | createNVPTXRuntimeFunction (unsigned Function) |
Returns specified OpenMP runtime function for the current OpenMP implementation. More... | |
const VarDecl * | translateParameter (const FieldDecl *FD, const VarDecl *NativeParam) const override |
Translates the native parameter of outlined function if this is required for target. More... | |
Address | getParameterAddress (CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override |
Gets the address of the native argument based on the address of the target-specific parameter. More... | |
void | emitOutlinedFunctionCall (CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const override |
Emits call of the outlined function with the provided arguments, translating these arguments to correct target-specific arguments. More... | |
void | emitFunctionProlog (CodeGenFunction &CGF, const Decl *D) override |
Emits OpenMP-specific function prolog. More... | |
Address | getAddressOfLocalVariable (CodeGenFunction &CGF, const VarDecl *VD) override |
Gets the OpenMP-specific address of the local variable. More... | |
void | functionFinished (CodeGenFunction &CGF) override |
Cleans up references to the objects in finished function. More... | |
void | getDefaultDistScheduleAndChunk (CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override |
Choose a default value for the dist_schedule clause. More... | |
void | getDefaultScheduleAndChunk (CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override |
Choose a default value for the schedule clause. More... | |
void | adjustTargetSpecificDataForLambdas (CodeGenFunction &CGF, const OMPExecutableDirective &D) const override |
Adjust some parameters for the target-based directives, like addresses of the variables captured by reference in lambdas. More... | |
void | checkArchForUnifiedAddressing (CodeGenModule &CGM, const OMPRequiresDecl *D) const override |
Perform check on requires decl to ensure that target architecture supports unified addressing. More... | |
Protected Member Functions | |
StringRef | getOutlinedHelperName () const override |
Get the function name of an outlined region. More... | |
bool | isDefaultLocationConstant () const override |
Check if the default location must be constant. More... | |
unsigned | getDefaultLocationReserved2Flags () const override |
Returns additional flags that can be stored in reserved_2 field of the default location. More... | |
Protected Member Functions inherited from clang::CodeGen::CGOpenMPRuntime | |
CGOpenMPRuntime (CodeGenModule &CGM, StringRef FirstSeparator, StringRef Separator) | |
Constructor allowing to redefine the name separator for the variables. More... | |
virtual void | emitTargetOutlinedFunctionHelper (const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) |
Helper to emit outlined function for 'target' directive. More... | |
void | emitOMPIfClause (CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen) |
Emits code for OpenMP 'if' clause using specified CodeGen function. More... | |
llvm::Value * | emitUpdateLocation (CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0) |
Emits object of ident_t type with info for source location. More... | |
llvm::Type * | getIdentTyPointerTy () |
Returns pointer to ident_t type. More... | |
llvm::Value * | getThreadID (CodeGenFunction &CGF, SourceLocation Loc) |
Gets thread id value for the current thread. More... | |
void | emitCall (CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *Callee, ArrayRef< llvm::Value *> Args=llvm::None) const |
Emits Callee function call with arguments Args with location Loc . More... | |
virtual Address | emitThreadIDAddress (CodeGenFunction &CGF, SourceLocation Loc) |
Emits address of the word in a memory where current thread id is stored. More... | |
void | setLocThreadIdInsertPt (CodeGenFunction &CGF, bool AtCurrentPoint=false) |
void | clearLocThreadIdInsertPt (CodeGenFunction &CGF) |
llvm::ArrayType * | getKmpCriticalNameTy () const |
Get the LLVM type for the critical name. More... | |
llvm::Value * | getCriticalRegionLock (StringRef CriticalName) |
Returns corresponding lock object for the specified critical region name. More... | |
Additional Inherited Members | |
![]() | |
static unsigned | getDefaultFlagsForBarriers (OpenMPDirectiveKind Kind) |
Returns default flags for the barriers depending on the directive for which this barrier is going to be emitted. More... | |
![]() | |
CodeGenModule & | CGM |
StringRef | FirstSeparator |
StringRef | Separator |
Definition at line 26 of file CGOpenMPRuntimeNVPTX.h.
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are actually global threadlocal, and Generic, in which the local variables are placed in global memory if they may escape their declaration context.
Enumerator | |
---|---|
CUDA | CUDA data sharing mode. |
Generic | Generic data-sharing mode. |
Definition at line 360 of file CGOpenMPRuntimeNVPTX.h.
Defines the execution mode.
Definition at line 29 of file CGOpenMPRuntimeNVPTX.h.
|
explicit |
Definition at line 1900 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenModule::getLangOpts().
Referenced by isDefaultLocationConstant().
|
override |
Adjust some parameters for the target-based directives, like addresses of the variables captured by reference in lambdas.
Definition at line 4325 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::CapturedStmtInfo, clang::CapturedStmt::captures(), clang::CapturedStmt::capturesVariable(), clang::CodeGen::CodeGenFunction::EmitLoadOfReferenceLValue(), clang::CodeGen::CodeGenFunction::EmitLValueForFieldInitialization(), clang::CodeGen::CodeGenFunction::EmitStoreOfScalar(), clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar(), clang::Type::getAsCXXRecordDecl(), clang::QualType::getCanonicalType(), clang::OMPExecutableDirective::getCapturedStmt(), clang::OMPExecutableDirective::getDirectiveKind(), clang::QualType::getNonReferenceType(), clang::CodeGen::Address::getPointer(), clang::ValueDecl::getType(), clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::isCXXThisExprCaptured(), clang::isOpenMPTargetExecutionDirective(), clang::Type::isReferenceType(), clang::LCK_ByRef, clang::CodeGen::CodeGenFunction::LoadCXXThis(), and clang::CodeGen::CodeGenFunction::MakeAddrLValue().
|
override |
Perform check on requires decl to ensure that target architecture supports unified addressing.
Check to see if target architecture supports unified addressing which is a restriction for OpenMP requires clause "unified_shared_memory".
Definition at line 4397 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::OMPRequiresDecl::clauselists(), clang::CodeGen::CodeGenModule::Error(), getCudaArch(), clang::GFX600, clang::GFX601, clang::GFX700, clang::GFX701, clang::GFX702, clang::GFX703, clang::GFX704, clang::GFX801, clang::GFX802, clang::GFX803, clang::GFX810, clang::GFX900, clang::GFX902, clang::GFX904, clang::GFX906, clang::GFX909, clang::LAST, clang::SM_20, clang::SM_21, clang::SM_30, clang::SM_32, clang::SM_35, clang::SM_37, clang::SM_50, clang::SM_52, clang::SM_53, clang::SM_60, clang::SM_61, clang::SM_62, clang::SM_70, clang::SM_72, clang::SM_75, and clang::UNKNOWN.
|
override |
Definition at line 4497 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::DeclContext::addDecl(), clang::AS_public, clang::ASTContext::buildImplicitRecord(), clang::ASTContext::CharTy, clang::CodeGen::clear(), clang::RecordDecl::completeDefinition(), clang::CodeGen::CodeGenTypes::ConvertTypeForMem(), clang::FieldDecl::Create(), clang::cuda_shared, clang::RecordDecl::field_empty(), clang::ASTContext::getConstantArrayType(), clang::CodeGen::CodeGenModule::getContext(), clang::CodeGen::CodeGenModule::getModule(), clang::CharUnits::getQuantity(), clang::ASTContext::getRecordType(), getSMsBlocksPerSM(), clang::ASTContext::getTargetAddressSpace(), clang::ASTContext::getTrivialTypeSourceInfo(), clang::ASTContext::getTypeAlignInChars(), clang::CodeGen::CodeGenModule::getTypes(), clang::ASTContext::getTypeSizeInChars(), clang::ICIS_NoInit, clang::CodeGen::CodeGenTypeCache::Int16Ty, max(), clang::ArrayType::Normal, clang::CodeGen::CodeGenTypeCache::SizeTy, clang::TagDecl::startDefinition(), clang::TTK_Union, clang::CodeGen::Type, and clang::CodeGen::CodeGenTypeCache::VoidPtrTy.
Referenced by isDefaultLocationConstant().
llvm::Constant * CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction | ( | unsigned | Function | ) |
Returns specified OpenMP runtime function for the current OpenMP implementation.
Specialized for the NVPTX device.
Function | OpenMP runtime function. |
Build void __kmpc_kernel_prepare_parallel( void *outlined_function, int16_t IsOMPRuntimeInitialized);
Build bool __kmpc_kernel_parallel(void **outlined_function, int16_t IsOMPRuntimeInitialized);
Build void __kmpc_kernel_end_parallel();
Build void __kmpc_data_sharing_init_stack();
Build void __kmpc_data_sharing_init_stack_spmd();
Build void __kmpc_begin_sharing_variables(void ***args, size_t n_args);
Build void __kmpc_end_sharing_variables();
Build void __kmpc_get_shared_variables(void ***GlobalArgs);
Definition at line 1551 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::ASTContext::BoolTy, clang::CodeGen::CodeGenTypes::ConvertType(), clang::CodeGen::CodeGenModule::CreateRuntimeFunction(), clang::CodeGen::CodeGenModule::getContext(), clang::CodeGen::CodeGenModule::getLLVMContext(), clang::CodeGen::CodeGenModule::getModule(), clang::CodeGen::CodeGenModule::getTypes(), clang::CodeGen::CodeGenTypeCache::Int16Ty, clang::CodeGen::CodeGenTypeCache::Int32Ty, clang::CodeGen::CodeGenTypeCache::Int64Ty, clang::CodeGen::CodeGenTypeCache::Int8PtrPtrTy, clang::CodeGen::CodeGenTypeCache::Int8PtrTy, clang::CodeGen::CodeGenTypeCache::Int8Ty, clang::LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(), clang::None, OMPRTL__kmpc_barrier, setPropertyExecutionMode(), clang::CodeGen::CodeGenTypeCache::SizeTy, supportsSPMDExecutionMode(), clang::CodeGen::Type, clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy, clang::CodeGen::CodeGenTypeCache::VoidPtrTy, and clang::CodeGen::CodeGenTypeCache::VoidTy.
Referenced by createRuntimeShuffleFunction(), emitBarrierCall(), emitOutlinedFunctionCall(), emitParallelCall(), and emitTeamsOutlinedFunction().
|
override |
Emit an implicit/explicit barrier for OpenMP threads.
Kind | Directive for which this implicit barrier call must be generated. Must be OMPD_barrier for explicit barrier generation. |
EmitChecks | true if need to emit checks for cancellation barriers. |
ForceSimpleCall | true if a simple barrier call must be emitted, false if the runtime class decides which one to emit (simple or with cancellation checks). |
Definition at line 2697 of file CGOpenMPRuntimeNVPTX.cpp.
References createNVPTXRuntimeFunction(), clang::CodeGen::CodeGenFunction::EmitRuntimeCall(), clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation(), clang::CodeGen::CGOpenMPRuntime::getDefaultFlagsForBarriers(), clang::CodeGen::CGOpenMPRuntime::getThreadID(), clang::CodeGen::CodeGenFunction::HaveInsertPoint(), and OMPRTL__kmpc_barrier.
Referenced by emitCriticalRegion(), and isDefaultLocationConstant().
|
override |
Emits a critical region.
CriticalName | Name of the critical region. |
CriticalOpGen | Generator for the statement associated with the given critical region. |
Hint | Value of the 'hint' clause (optional). |
Definition at line 2711 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::Builder, clang::CodeGen::CodeGenFunction::createBasicBlock(), clang::CodeGen::CodeGenFunction::CreateMemTemp(), emitBarrierCall(), clang::CodeGen::CodeGenFunction::EmitBlock(), clang::CodeGen::CodeGenFunction::EmitBranch(), clang::CodeGen::emitCriticalRegion(), clang::CodeGen::CodeGenFunction::EmitLoadOfScalar(), clang::CodeGen::CodeGenFunction::EmitStoreOfScalar(), clang::CodeGen::CodeGenFunction::getContext(), clang::ASTContext::getIntTypeForBitwidth(), getNVPTXNumThreads(), getNVPTXThreadID(), clang::CodeGen::CodeGenTypeCache::Int32Ty, clang::CodeGen::CodeGenFunction::MakeAddrLValue(), and clang::OMPD_unknown.
Referenced by isDefaultLocationConstant().
|
override |
Emits OpenMP-specific function prolog.
Required for device constructs.
Definition at line 4189 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::CapturedStmtInfo, clang::CodeGen::CodeGenFunction::CGM, clang::CR_OpenMP, clang::CodeGen::CodeGenFunction::CurFn, clang::CodeGen::CodeGenFunction::EHStack, EM_SPMD, Generic, clang::Decl::getBeginLoc(), getDataSharingMode(), clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::getKind(), clang::CodeGen::CodeGenModule::getOpenMPRuntime(), clang::None, and clang::CodeGen::NormalAndEHCleanup.
|
override |
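This function ought to emit, in the general case, a call to the OpenMP runtime function __kmpc_push_num_teams. In the NVPTX backend this is not needed, as these numbers are obtained through the PTX grid and block configuration.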
NumTeams | An integer expression of teams. |
ThreadLimit | An integer expression of threads. |
Definition at line 1926 of file CGOpenMPRuntimeNVPTX.cpp.
Referenced by isDefaultLocationConstant().
|
overridevirtual |
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads) to generate code for 'num_threads' clause.
NumThreads | An integer value of threads. |
Definition at line 1916 of file CGOpenMPRuntimeNVPTX.cpp.
References EM_SPMD, and clang::CodeGen::emitNumThreadsClause().
Referenced by isDefaultLocationConstant().
|
override |
Emits call of the outlined function with the provided arguments, translating these arguments to correct target-specific arguments.
Definition at line 4036 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenTypes::arrangeBuiltinFunctionDeclaration(), clang::CodeGen::CodeGenFunction::Builder, castValueToType(), clang::CodeGen::CodeGenFunction::ConvertTypeForMem(), clang::Create(), clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP(), clang::CodeGen::CodeGenFunction::CreateDefaultAlignTempAlloca(), clang::CodeGen::CodeGenFunction::CreateMemTemp(), createNVPTXRuntimeFunction(), clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast(), clang::CodeGen::CodeGenFunction::EmitLoadOfPointer(), clang::CodeGen::CodeGenFunction::EmitLoadOfScalar(), clang::CodeGen::emitOutlinedFunctionCall(), clang::CodeGen::CodeGenFunction::EmitRuntimeCall(), clang::CodeGen::CodeGenFunction::FinishFunction(), clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar(), clang::OMPExecutableDirective::getBeginLoc(), clang::OMPExecutableDirective::getCapturedStmt(), clang::CodeGen::CodeGenModule::getContext(), clang::CodeGen::CodeGenFunction::getContext(), clang::OMPExecutableDirective::getDirectiveKind(), clang::CodeGen::CodeGenTypes::GetFunctionType(), clang::ASTContext::getIntTypeForBitwidth(), clang::CodeGen::CodeGenModule::getModule(), clang::CodeGen::Address::getPointer(), clang::CodeGen::CodeGenTypeCache::getPointerSize(), clang::ASTContext::getPointerType(), clang::ASTContext::getSizeType(), clang::CodeGen::CodeGenModule::getTypes(), clang::ASTContext::getUIntPtrType(), clang::CodeGen::CodeGenFunction::InitTempAlloca(), clang::InternalLinkage, clang::CodeGen::Address::invalid(), clang::isOpenMPLoopBoundSharingDirective(), clang::ImplicitParamDecl::Other, clang::CodeGen::CodeGenModule::SetInternalFunctionAttributes(), clang::CodeGen::CodeGenTypeCache::SizeTy, clang::CodeGen::CodeGenFunction::StartFunction(), clang::CodeGen::Type, clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy, clang::ASTContext::VoidPtrTy, and clang::ASTContext::VoidTy.
Referenced by emitParallelCall(), and emitTeamsCall().
|
override |
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record whose address is stored in CapturedStruct.
OutlinedFn | Outlined function to be run in parallel threads. Type of this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). |
CapturedVars | A pointer to the record with the references to variables used in OutlinedFn function. |
IfCond | Condition in the associated 'if' clause, if it was specified, nullptr otherwise. |
Definition at line 2448 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::Builder, clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP(), clang::CodeGen::ApplyDebugLocation::CreateEmpty(), clang::CodeGen::CodeGenFunction::CreateMemTemp(), createNVPTXRuntimeFunction(), clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast(), EM_SPMD, clang::CodeGen::CGOpenMPRuntime::emitOMPIfClause(), emitOutlinedFunctionCall(), clang::CodeGen::CodeGenFunction::EmitRuntimeCall(), clang::CodeGen::CGOpenMPRuntime::emitThreadIDAddress(), clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation(), clang::CodeGen::CodeGenFunction::getContext(), clang::CodeGen::CGOpenMPRuntime::getIdentTyPointerTy(), clang::ASTContext::getIntTypeForBitwidth(), clang::CodeGen::Address::getPointer(), clang::ASTContext::getPointerType(), clang::CodeGen::CGOpenMPRuntime::getThreadID(), clang::CodeGen::CodeGenFunction::HaveInsertPoint(), clang::CodeGen::CodeGenFunction::InitTempAlloca(), clang::CodeGen::CodeGenTypeCache::Int32Ty, clang::CodeGen::CodeGenTypeCache::Int8PtrTy, clang::InternalLinkage, clang::CodeGen::RegionCodeGenTy::setAction(), clang::CodeGen::CodeGenTypeCache::SizeTy, and clang::ASTContext::VoidPtrTy.
Referenced by isDefaultLocationConstant().
|
override |
Emits the outlined function for the specified OpenMP parallel directive D.
This outlined function has type void(*)(kmp_int32 *ThreadID, kmp_int32 BoundID, struct context_vars*).
D | OpenMP directive. |
ThreadIDVar | Variable for thread id in the current OpenMP region. |
InnermostKind | Kind of innermost directive (for simple directives it is a directive itself, for combined - its innermost directive). |
CodeGen | Code generation sequence for the D directive. |
Definition at line 1931 of file CGOpenMPRuntimeNVPTX.cpp.
References EM_SPMD, clang::CodeGen::emitParallelOutlinedFunction(), and clang::CodeGen::RegionCodeGenTy::setAction().
Referenced by isDefaultLocationConstant().
|
overridevirtual |
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generate code for 'proc_bind' clause.
Definition at line 1906 of file CGOpenMPRuntimeNVPTX.cpp.
References EM_SPMD, and clang::CodeGen::emitProcBindClause().
Referenced by isDefaultLocationConstant().
|
overridevirtual |
Emit code for the reduction clause.
Design of OpenMP reductions on the GPU.
Privates | List of private copies for original reduction arguments. |
LHSExprs | List of LHS in ReductionOps reduction operations. |
RHSExprs | List of RHS in ReductionOps reduction operations. |
ReductionOps | List of reduction operations in form 'LHS binop RHS' or 'operator binop(LHS, RHS)'. |
Options | List of options for reduction codegen: WithNowait true if parent directive has also nowait clause, false otherwise. SimpleReduction Emit reduction operation only. Used for omp simd directive on the host. ReductionKind The kind of reduction to perform. |
Consider a typical OpenMP program with one or more reduction clauses:
float foo; double bar; #pragma omp target teams distribute parallel for \ reduction(+:foo) reduction(*:bar) for (int i = 0; i < N; i++) { foo += A[i]; bar *= B[i]; }
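where 'foo' and 'bar' are reduced across all OpenMP threads in all teams. In our OpenMP implementation on the NVPTX device an OpenMP team is mapped to a CUDA threadblock and OpenMP threads within a team are mapped to CUDA threads within a threadblock. Our goal is to efficiently aggregate values across all OpenMP threads such that: (a) the compiler and runtime are logically concise, and (b) the reduction is performed efficiently in a hierarchical manner as follows: within OpenMP threads in the same warp, across warps in a threadblock, and finally across teams on the NVPTX device.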
Introduction to Decoupling
We would like to decouple the compiler and the runtime so that the latter is ignorant of the reduction variables (number, data types) and the reduction operators. This allows a simpler interface and implementation while still attaining good performance.
Pseudocode for the aforementioned OpenMP program generated by the compiler is as follows:
Call the OpenMP runtime on the GPU to reduce within a team and store the result on the team master:
__kmpc_nvptx_parallel_reduce_nowait_v2(..., reduceData, shuffleReduceFn, interWarpCpyFn)
where: struct ReduceData { double *foo; double *bar; } reduceData reduceData.foo = &foo_private reduceData.bar = &bar_private
'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two auxiliary functions generated by the compiler that operate on variables of type 'ReduceData'. They help the runtime perform algorithmic steps in a data-agnostic manner.
'shuffleReduceFn' is a pointer to a function that reduces data of type 'ReduceData' across two OpenMP threads (lanes) in the same warp. It takes the following arguments as input:
a. variable of type 'ReduceData' on the calling lane, b. its lane_id, c. an offset relative to the current lane_id to generate a remote_lane_id. The remote lane contains the second variable of type 'ReduceData' that is to be reduced. d. an algorithm version parameter determining which reduction algorithm to use.
'shuffleReduceFn' retrieves data from the remote lane using efficient GPU shuffle intrinsics and reduces, using the algorithm specified by the 4th parameter, the two operands element-wise. The result is written to the first operand.
Different reduction algorithms are implemented in different runtime functions, all calling 'shuffleReduceFn' to perform the essential reduction step. Therefore, based on the 4th parameter, this function behaves slightly differently to cooperate with the runtime to ensure correctness under different circumstances.
'interWarpCpyFn' is a pointer to a function that transfers reduced variables across warps. It tunnels, through CUDA shared memory, the thread-private data of type 'ReduceData' from lane 0 of each warp to a lane in the first warp.
Call the OpenMP runtime on the GPU to reduce across teams. The last team writes the global reduced value to memory.
ret = __kmpc_nvptx_teams_reduce_nowait(..., reduceData, shuffleReduceFn, interWarpCpyFn, scratchpadCopyFn, loadAndReduceFn)
'scratchpadCopyFn' is a helper that stores reduced data from the team master to a scratchpad array in global memory.
'loadAndReduceFn' is a helper that loads data from the scratchpad array and reduces it with the input operand.
These compiler generated functions hide address calculation and alignment information from the runtime.
Warp Reduction Algorithms
On the warp level, we have three algorithms implemented in the OpenMP runtime depending on the number of active lanes:
Full Warp Reduction
The reduce algorithm within a warp where all lanes are active is implemented in the runtime as follows:
full_warp_reduce(void *reduce_data, kmp_ShuffleReductFctPtr ShuffleReduceFn) { for (int offset = WARPSIZE/2; offset > 0; offset /= 2) ShuffleReduceFn(reduce_data, 0, offset, 0); }
The algorithm completes in log(2, WARPSIZE) steps.
'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id is not used in this version; we therefore save instructions by not retrieving lane_id from the corresponding special registers. The 4th parameter, which represents the version of the algorithm being used, is set to 0 to signify full warp reduction.
In this version, 'ShuffleReduceFn' behaves, per element, as follows:
#reduce_elem refers to an element in the local lane's data structure #remote_elem is retrieved from a remote lane remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); reduce_elem = reduce_elem REDUCE_OP remote_elem;
Contiguous Partial Warp Reduction
This reduce algorithm is used within a warp where only the first 'n' (n <= WARPSIZE) lanes are active. It is typically used when the number of OpenMP threads in a parallel region is not a multiple of WARPSIZE. The algorithm is implemented in the runtime as follows:
void contiguous_partial_reduce(void *reduce_data, kmp_ShuffleReductFctPtr ShuffleReduceFn, int size, int lane_id) { int curr_size; int offset; curr_size = size; offset = curr_size/2; while (offset>0) { ShuffleReduceFn(reduce_data, lane_id, offset, 1); curr_size = (curr_size+1)/2; offset = curr_size/2; } }
In this version, 'ShuffleReduceFn' behaves, per element, as follows:
remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); if (lane_id < offset) reduce_elem = reduce_elem REDUCE_OP remote_elem else reduce_elem = remote_elem
This algorithm assumes that the data to be reduced are located in a contiguous subset of lanes starting from the first. When there is an odd number of active lanes, the data in the last lane is not aggregated with any other lane's data but is instead copied over.
Dispersed Partial Warp Reduction
This algorithm is used within a warp when any discontiguous subset of lanes are active. It is used to implement the reduction operation across lanes in an OpenMP simd region or in a nested parallel region.
void dispersed_partial_reduce(void *reduce_data, kmp_ShuffleReductFctPtr ShuffleReduceFn) { int size, remote_id; int logical_lane_id = number_of_active_lanes_before_me() * 2; do { remote_id = next_active_lane_id_right_after_me();
size = number_of_active_lanes_in_this_warp(); logical_lane_id /= 2; ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2); } while (logical_lane_id % 2 == 0 && size > 1); }
There is no assumption made about the initial state of the reduction. Any number of lanes (>=1) could be active at any position. The reduction result is returned in the first active lane.
In this version, 'ShuffleReduceFn' behaves, per element, as follows:
remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); if (lane_id % 2 == 0 && offset > 0) reduce_elem = reduce_elem REDUCE_OP remote_elem else reduce_elem = remote_elem
Intra-Team Reduction
This function, as implemented in the runtime call '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP threads in a team. It first reduces within a warp using the aforementioned algorithms. We then proceed to gather all such reduced values at the first warp.
The runtime makes use of the function 'InterWarpCpyFn', which copies data from each of the "warp masters" (the zeroth lane of each warp, where warp-reduced data is held) to the zeroth warp. This step reduces (in a mathematical sense) the problem of reduction across warp masters in a block to the problem of warp reduction.
Inter-Team Reduction
Once a team has reduced its data to a single value, it is stored in a global scratchpad array. Since each team has a distinct slot, this can be done without locking.
The last team to write to the scratchpad array proceeds to reduce the scratchpad array. One or more workers in the last team use the helper 'loadAndReduceDataFn' to load and reduce values from the array, i.e., the k'th worker reduces every k'th element.
Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to reduce across workers and compute a globally reduced value.
Definition at line 3813 of file CGOpenMPRuntimeNVPTX.cpp.
Referenced by isDefaultLocationConstant().
|
override |
Emits code for a teams call of the OutlinedFn with variables captured in a record whose address is stored in CapturedStruct.
OutlinedFn | Outlined function to be run by team masters. Type of this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). |
CapturedVars | A pointer to the record with the references to variables used in OutlinedFn function. |
Definition at line 2429 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::Builder, clang::CodeGen::CodeGenFunction::CreateMemTemp(), emitOutlinedFunctionCall(), clang::CodeGen::CGOpenMPRuntime::emitThreadIDAddress(), clang::CodeGen::CodeGenFunction::getContext(), clang::ASTContext::getIntTypeForBitwidth(), clang::CodeGen::Address::getPointer(), clang::CodeGen::CodeGenFunction::HaveInsertPoint(), and clang::CodeGen::CodeGenFunction::InitTempAlloca().
Referenced by isDefaultLocationConstant().
|
override |
Emits the outlined function for the specified OpenMP teams directive D.
This outlined function has type void(*)(kmp_int32 *ThreadID, kmp_int32 BoundID, struct context_vars*).
D | OpenMP directive. |
ThreadIDVar | Variable for thread id in the current OpenMP region. |
InnermostKind | Kind of innermost directive (for simple directives it is a directive itself, for combined - its innermost directive). |
CodeGen | Code generation sequence for the D directive. |
Definition at line 2008 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::Builder, clang::Type::castAsArrayTypeUnsafe(), clang::CodeGen::CodeGenFunction::CGM, clang::CodeGen::CodeGenFunction::ConvertTypeForMem(), clang::CodeGen::CodeGenFunction::createBasicBlock(), clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP(), clang::CodeGen::ApplyDebugLocation::CreateEmpty(), clang::CodeGen::CodeGenFunction::CreateMemTemp(), createNVPTXRuntimeFunction(), clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast(), clang::CodeGen::CodeGenFunction::CurFn, clang::CodeGen::Decl, EM_SPMD, EM_Unknown, clang::CodeGen::CodeGenFunction::EmitBlock(), clang::CodeGen::CodeGenFunction::EmitBranch(), clang::CodeGen::CodeGenFunction::EmitCastToVoidPtr(), clang::CodeGen::CodeGenFunction::EmitLoadOfPointer(), clang::CodeGen::CodeGenFunction::EmitLoadOfScalar(), clang::CodeGen::CodeGenFunction::EmitLValueForField(), clang::CodeGen::CodeGenFunction::EmitNounwindRuntimeCall(), clang::CodeGen::CodeGenFunction::EmitRuntimeCall(), clang::CodeGen::CodeGenFunction::EmitStoreOfScalar(), clang::CodeGen::emitTeamsOutlinedFunction(), clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation(), clang::CharUnits::fromQuantity(), Generic, clang::CodeGen::LValue::getAddress(), clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar(), clang::CodeGen::Address::getAlignment(), clang::CodeGen::LValue::getAlignment(), clang::OMPExecutableDirective::getBeginLoc(), clang::CapturedStmt::getCapturedDecl(), clang::OMPExecutableDirective::getCapturedStmt(), clang::CodeGen::CodeGenModule::getContext(), getDataSharingMode(), clang::ASTContext::getDeclAlign(), getDistributeLastprivateVars(), clang::ArrayType::getElementType(), clang::ASTContext::getIntTypeForBitwidth(), clang::CodeGen::CodeGenModule::getModule(), getNVPTXLaneID(), clang::CodeGen::CodeGenModule::getOpenMPRuntime(), clang::CodeGen::Address::getPointer(), clang::CodeGen::LValue::getPointer(), clang::CodeGen::CodeGenTypeCache::getPointerAlign(), clang::ASTContext::getPointerType(), 
clang::CharUnits::getQuantity(), clang::ASTContext::getRecordType(), clang::CodeGen::CodeGenTypeCache::getSizeAlign(), clang::ASTContext::getSizeType(), getTeamsReductionVars(), clang::CodeGen::CGOpenMPRuntime::getThreadID(), clang::CodeGen::Address::getType(), clang::ASTContext::getTypeAlignInChars(), clang::CodeGen::CodeGenFunction::getTypeSize(), clang::ASTContext::getTypeSizeInChars(), clang::CodeGen::CodeGenFunction::HaveInsertPoint(), clang::CodeGen::CodeGenTypeCache::Int16Ty, clang::CodeGen::CodeGenTypeCache::Int8Ty, clang::InternalLinkage, clang::CodeGen::CodeGenFunction::MakeAddrLValue(), clang::CodeGen::CodeGenFunction::MakeNaturalAlignPointeeAddrLValue(), clang::None, Offset, clang::CharUnits::One(), clang::CodeGen::RegionCodeGenTy::setAction(), clang::CodeGen::LValue::setAddress(), clang::CodeGen::CodeGenTypeCache::SizeTy, clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy, and clang::ASTContext::VoidPtrTy.
Referenced by isDefaultLocationConstant().
|
override |
Cleans up references to the objects in finished function.
Definition at line 4293 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::CurFn, and clang::CodeGen::functionFinished().
|
override |
Gets the OpenMP-specific address of the local variable.
Definition at line 4267 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::Decl::attr_begin(), clang::Decl::attr_end(), clang::CodeGen::CodeGenFunction::CurFn, Generic, getCanonicalDecl(), clang::VarDecl::getCanonicalDecl(), getDataSharingMode(), clang::Decl::hasAttrs(), clang::if(), and clang::CodeGen::Address::invalid().
|
override |
Choose a default value for the dist_schedule clause.
Definition at line 4298 of file CGOpenMPRuntimeNVPTX.cpp.
|
overrideprotectedvirtual |
Returns additional flags that can be stored in reserved_2 field of the default location.
For the NVPTX target, this contains data about the SPMD/Non-SPMD execution mode and the Full/Lightweight runtime mode. It is used for better optimization.
Reimplemented from clang::CodeGen::CGOpenMPRuntime.
Definition at line 1885 of file CGOpenMPRuntimeNVPTX.cpp.
Referenced by isDefaultLocationConstant().
|
override |
Choose a default value for the schedule clause.
Definition at line 4313 of file CGOpenMPRuntimeNVPTX.cpp.
|
inlineoverrideprotectedvirtual |
Get the function name of an outlined region.
Reimplemented from clang::CodeGen::CGOpenMPRuntime.
Definition at line 184 of file CGOpenMPRuntimeNVPTX.h.
|
override |
Gets the address of the native argument based on the address of the target-specific parameter.
NativeParam | Parameter itself. |
TargetParam | Corresponding target-specific parameter. |
Definition at line 4006 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::CodeGen::CodeGenFunction::Builder, clang::CodeGen::CodeGenFunction::CreateMemTemp(), clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast(), clang::CodeGen::CodeGenFunction::EmitLoadOfScalar(), clang::CodeGen::CodeGenFunction::EmitStoreOfScalar(), clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar(), clang::CodeGen::CodeGenFunction::getContext(), clang::ASTContext::getTargetAddressSpace(), clang::ValueDecl::getType(), clang::Type::isReferenceType(), and clang::QualifierCollector::strip().
|
inlineoverrideprotectedvirtual |
Check if the default location must be constant.
Constant for NVPTX for better optimization.
Reimplemented from clang::CodeGen::CGOpenMPRuntime.
Definition at line 190 of file CGOpenMPRuntimeNVPTX.h.
References clang::CodeGen::CGOpenMPRuntime::CGM, CGOpenMPRuntimeNVPTX(), clear(), emitBarrierCall(), emitCriticalRegion(), emitNumTeamsClause(), emitNumThreadsClause(), emitParallelCall(), emitParallelOutlinedFunction(), emitProcBindClause(), emitReduction(), emitTeamsCall(), emitTeamsOutlinedFunction(), getDefaultLocationReserved2Flags(), and Privates.
|
override |
Translates the native parameter of the outlined function if this is required for the target.
FD | Field decl from captured record for the parameter. |
NativeParam | Parameter itself. |
Definition at line 3974 of file CGOpenMPRuntimeNVPTX.cpp.
References clang::Qualifiers::addAddressSpace(), clang::Qualifiers::addRestrict(), clang::QualifierCollector::apply(), clang::ImplicitParamDecl::Create(), clang::ParmVarDecl::Create(), clang::ASTContext::getAddrSpaceQualType(), clang::Decl::getAttr(), clang::DeclaratorDecl::getBeginLoc(), clang::CodeGen::CodeGenModule::getContext(), clang::Decl::getDeclContext(), clang::NamedDecl::getIdentifier(), clang::getLangASFromTargetAS(), clang::Decl::getLocation(), clang::ASTContext::getPointerType(), clang::ValueDecl::getType(), clang::Type::isReferenceType(), clang::opencl_global, clang::ImplicitParamDecl::Other, clang::SC_None, and clang::QualifierCollector::strip().