[OpenMP] add 4 custom APIs supporting MSVC OMP codegen

This check-in adds 4 APIs to support MSVC, specifically:

* 3 APIs (__kmpc_sections_init, __kmpc_next_section,
  __kmpc_end_sections) to support the dynamic scheduling of OMP sections
  (a lowering sketch follows below).
* 1 API (__kmpc_copyprivate_light, a lightweight version of
  __kmpc_copyprivate) to support the OMP single copyprivate clause.
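As an illustrative sketch only (loc, gtid, and the section bodies A()/B() are placeholders; the actual MSVC-generated code is not part of this patch), a compiler could lower a two-section "#pragma omp sections" construct onto these entry points roughly as follows. The runtime calls contain no implicit barrier, so the compiler emits one at the end unless 'nowait' is present.

// Hypothetical lowering sketch for:
//   #pragma omp sections
//   { #pragma omp section A();
//     #pragma omp section B(); }
if (__kmpc_sections_init(loc, gtid)) {
  kmp_int32 idx;
  // Dynamically claim section ids until none are left (2 sections here).
  while ((idx = __kmpc_next_section(loc, gtid, 2)) >= 0 && idx < 2) {
    switch (idx) {
    case 0: A(); break;
    case 1: B(); break;
    }
  }
} else {
  // Serialized team: this thread executes all sections itself.
  A();
  B();
}
__kmpc_end_sections(loc, gtid);
// No implicit barrier in the calls above; emit one unless 'nowait' was given.
__kmpc_barrier(loc, gtid);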

Differential Revision: https://reviews.llvm.org/D128403
Author: Vadim Paretsky
Date: 2022-07-05 17:20:32 -05:00
Committed by: Jonathan Peyton
Parent: b97013fd60
Commit: 43d5c4d539
4 changed files with 284 additions and 1 deletion


@@ -397,6 +397,13 @@ kmpc_set_disp_num_buffers 267
__kmpc_end_scope 287
%endif
%ifndef stub
__kmpc_copyprivate_light 288
__kmpc_sections_init 289
__kmpc_next_section 290
__kmpc_end_sections 291
%endif
# User API entry points that have both lower- and upper- case versions for Fortran.
# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.
# User API entry points are entry points that start with 'kmp_' or 'omp_'.


@@ -3890,6 +3890,11 @@ KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *,
KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
KMP_EXPORT kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 global_tid,
                                         kmp_int32 numberOfSections);
KMP_EXPORT void __kmpc_end_sections(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid,
                                     kmp_int32 schedtype, kmp_int32 *plastiter,
                                     kmp_int *plower, kmp_int *pupper,
@@ -3903,6 +3908,9 @@ KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
                                   void (*cpy_func)(void *, void *),
                                   kmp_int32 didit);
KMP_EXPORT void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid,
                                          void *cpy_data);
extern void KMPC_SET_NUM_THREADS(int arg);
extern void KMPC_SET_DYNAMIC(int flag);
extern void KMPC_SET_NESTED(int flag);


@@ -2224,6 +2224,61 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
  }
}
/* --------------------------------------------------------------------------*/
/*!
@ingroup THREADPRIVATE
@param loc source location information
@param gtid global thread number
@param cpy_data pointer to the data to be saved/copied, or 0
@return the saved pointer to the data

__kmpc_copyprivate_light is a lighter-weight version of __kmpc_copyprivate:
it only saves the pointer it is given (if non-zero, i.e. coming from the
thread that executed the single region) and returns that pointer from every
call, so the executing thread itself does not need the result. It does not
copy any data; the copying has to be done elsewhere, e.g. inline in the
generated code. For the same reason, and unlike __kmpc_copyprivate, this
function has no barrier at its end, so the generated code needs a barrier
after all data copying is done.
*/
void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));

  KMP_MB();

  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following barrier

  if (cpy_data)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
  /* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return *data_ptr;
}
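For context, a hedged sketch (not from this patch; x, loc, and gtid are placeholders) of how generated code could use this entry point for "#pragma omp single copyprivate(x)". Both the actual data copy and the final barrier are the generated code's responsibility:

// Hypothetical lowering sketch for:
//   #pragma omp single copyprivate(x)
//     x = 42;
int x;                                    // each thread's private copy
kmp_int32 did_it = __kmpc_single(loc, gtid);
if (did_it) {
  x = 42;                                 // single region body, winner only
  __kmpc_end_single(loc, gtid);
}
// Only the winner passes a non-zero pointer; every thread gets it back.
void *src = __kmpc_copyprivate_light(loc, gtid, did_it ? &x : NULL);
if (!did_it)
  x = *(int *)src;                        // copy done inline, not by the runtime
// __kmpc_copyprivate_light has no trailing barrier, so the generated code
// must add one after all copies have completed.
__kmpc_barrier(loc, gtid);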
/* -------------------------------------------------------------------------- */
#define INIT_LOCK __kmp_init_user_lock_with_checks
@@ -4348,7 +4403,7 @@ void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
                  omp_allocator_handle_t free_allocator) {
  return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
                       free_allocator);
}
void omp_free(void *ptr, omp_allocator_handle_t allocator) {


@@ -2285,6 +2285,219 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
  return status;
}
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return Zero if the parallel region is not active and this thread should
execute all sections; non-zero otherwise.

Beginning of the sections construct.
There are no implicit barriers in the "sections" calls; instead, the compiler
should introduce an explicit barrier if one is required.

This implementation is based on __kmp_dispatch_init and uses the same
constructs for the shared data (sections cannot be nested directly inside an
omp for loop; there has to be a parallel region in between).
*/
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));

  if (active) {
    // Setup sections in the same way as dynamic scheduled loops.
    // We need one shared data: which section is to execute next.
    // (in case parallel is not active, all sections will be executed on the
    // same thread)
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    // reuse shared data structures from dynamic sched loops:
    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));

    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current =
        nullptr; // sections construct doesn't need private data
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return active;
}
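The buffer handling above mirrors __kmp_dispatch_init: each instance of a dynamically scheduled construct claims a slot in a small ring of team-wide shared buffers and spins until the previous user of that slot has released it. A simplified stand-alone model of that scheme (illustration only; NUM_BUFFERS stands in for __kmp_dispatch_num_buffers, and this is not the runtime's actual code):

#include <atomic>

constexpr unsigned NUM_BUFFERS = 7;       // stands in for __kmp_dispatch_num_buffers

struct shared_buffer {
  std::atomic<unsigned> buffer_index{0};  // instance currently allowed to use this slot
  // ... the shared iteration / num_done counters would live here too ...
};

shared_buffer ring[NUM_BUFFERS];          // one ring per team, like team->t.t_disp_buffer

shared_buffer *acquire(unsigned my_instance) {
  shared_buffer *slot = &ring[my_instance % NUM_BUFFERS];
  while (slot->buffer_index.load() != my_instance) {
    // spin, as __kmp_wait does on sh->buffer_index
  }
  return slot;
}
// The last thread through the construct releases the slot with
// buffer_index += NUM_BUFFERS, which is what __kmpc_next_section does below.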
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@param numberOfSections number of sections in the 'sections' construct
@return a value in [0, numberOfSections) - the id of the section this thread
should execute next; numberOfSections (or any other value out of range) -
nothing left for this thread to execute
*/
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
                              kmp_int32 numberOfSections) {

  KMP_TIME_PARTITIONED_BLOCK(OMP_sections);

  kmp_info_t *th = __kmp_threads[gtid];
#ifdef KMP_DEBUG
  kmp_team_t *team = th->th.th_team;
#endif

  KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
                  numberOfSections));

  // For serialized case we should not call this function:
  KMP_DEBUG_ASSERT(!team->t.t_serialized);

  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

  KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
  sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
      th->th.th_dispatch->th_dispatch_sh_current);
  KMP_DEBUG_ASSERT(sh);

  kmp_int32 sectionIndex = 0;
  bool moreSectionsToExecute = true;

  // Find section to execute:
  sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
  if (sectionIndex >= numberOfSections) {
    moreSectionsToExecute = false;
  }

  // status == 0: no more sections to execute;
  // OMPTODO: __kmpc_end_sections could be bypassed?
  if (!moreSectionsToExecute) {
    kmp_int32 num_done;
    num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));

    if (num_done == th->th.th_team_nproc - 1) {
      /* NOTE: release this buffer to be reused */

      KMP_MB(); /* Flush all pending memory write invalidates. */

      sh->u.s.num_done = 0;
      sh->u.s.iteration = 0;

      KMP_MB(); /* Flush all pending memory write invalidates. */

      sh->buffer_index += __kmp_dispatch_num_buffers;
      KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
                     sh->buffer_index));

      KMP_MB(); /* Flush all pending memory write invalidates. */
    } // if

    th->th.th_dispatch->th_deo_fcn = NULL;
    th->th.th_dispatch->th_dxo_fcn = NULL;
    th->th.th_dispatch->th_dispatch_sh_current = NULL;
    th->th.th_dispatch->th_dispatch_pr_current = NULL;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_dispatch) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_data_t instance = ompt_data_none;
      instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
          &(team_info->parallel_data), &(task_info->task_data),
          ompt_dispatch_section, instance);
    }
#endif
    KMP_POP_PARTITIONED_TIMER();
  }

  return sectionIndex;
}
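Put differently, the handout performed by __kmpc_next_section is equivalent to a dynamically scheduled loop with chunk size 1 over the section ids. A minimal stand-alone analogy using plain C++ atomics in place of test_then_inc and the shared dispatch buffer (illustration only):

#include <atomic>

// next_counter plays the role of sh->u.s.iteration.
std::atomic<int> next_counter{0};

// Returns the id of the next unclaimed section, or numberOfSections when
// every section has already been handed out (nothing left for this thread).
int claim_section(int numberOfSections) {
  int idx = next_counter.fetch_add(1);   // analogous to test_then_inc
  return idx < numberOfSections ? idx : numberOfSections;
}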
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number

End of the "sections" construct.
There is no need to wait here: a barrier is added separately when needed.
*/
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {

  kmp_info_t *th = __kmp_threads[gtid];
  int active = !th->th.th_team->t.t_serialized;

  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));

  if (!active) {
    // In active case call finalization is done in __kmpc_next_section
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_work) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_callbacks.ompt_callback(ompt_callback_work)(
          ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
          &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
    }
#endif
    KMP_POP_PARTITIONED_TIMER();
  }

  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
}
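Since neither __kmpc_next_section nor __kmpc_end_sections blocks, the epilogue a compiler emits for the construct might look like the following sketch (has_nowait_clause is a hypothetical compile-time condition, not a real flag):

__kmpc_end_sections(loc, gtid);  // finalization only; never waits
if (!has_nowait_clause)          // hypothetical compile-time decision
  __kmpc_barrier(loc, gtid);     // explicit barrier unless 'nowait' was given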
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,