[OpenMP] add 4 custom APIs supporting MSVC OMP codegen

This check-in adds 4 APIs to support MSVC, specifically:

* 3 APIs (__kmpc_sections_init, __kmpc_next_section,
  __kmpc_end_sections) to support the dynamic scheduling of OMP sections
  (a lowering sketch follows below).
* 1 API (__kmpc_copyprivate_light, a lightweight version of
  __kmpc_copyprivate) to support the OMP single copyprivate clause.
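As an illustrative sketch only (loc, gtid, and the section bodies A()/B() are placeholders; the actual MSVC-generated code is not part of this patch), a compiler could lower a two-section "#pragma omp sections" construct onto these entry points roughly as follows. The runtime calls contain no implicit barrier, so the compiler emits one at the end unless 'nowait' is present.

// Hypothetical lowering sketch for:
//   #pragma omp sections
//   { #pragma omp section A();
//     #pragma omp section B(); }
if (__kmpc_sections_init(loc, gtid)) {
  kmp_int32 idx;
  // Dynamically claim section ids until none are left (2 sections here).
  while ((idx = __kmpc_next_section(loc, gtid, 2)) >= 0 && idx < 2) {
    switch (idx) {
    case 0: A(); break;
    case 1: B(); break;
    }
  }
} else {
  // Serialized team: this thread executes all sections itself.
  A();
  B();
}
__kmpc_end_sections(loc, gtid);
// No implicit barrier in the calls above; emit one unless 'nowait' was given.
__kmpc_barrier(loc, gtid);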

Differential Revision: https://reviews.llvm.org/D128403
Author: Vadim Paretsky
Date: 2022-07-05 17:20:32 -05:00
Committed by: Jonathan Peyton
Parent: b97013fd60
Commit: 43d5c4d539
4 changed files with 284 additions and 1 deletion


@@ -397,6 +397,13 @@ kmpc_set_disp_num_buffers 267
__kmpc_end_scope 287
%endif
%ifndef stub
__kmpc_copyprivate_light 288
__kmpc_sections_init 289
__kmpc_next_section 290
__kmpc_end_sections 291
%endif
# User API entry points that have both lower- and upper- case versions for Fortran.
# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.
# User API entry points are entry points that start with 'kmp_' or 'omp_'.


@@ -3890,6 +3890,11 @@ KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *,
KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
KMP_EXPORT kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 global_tid,
                                         kmp_int32 numberOfSections);
KMP_EXPORT void __kmpc_end_sections(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid,
                                     kmp_int32 schedtype, kmp_int32 *plastiter,
                                     kmp_int *plower, kmp_int *pupper,
@@ -3903,6 +3908,9 @@ KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
                                   void (*cpy_func)(void *, void *),
                                   kmp_int32 didit);
KMP_EXPORT void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid,
                                          void *cpy_data);
extern void KMPC_SET_NUM_THREADS(int arg);
extern void KMPC_SET_DYNAMIC(int flag);
extern void KMPC_SET_NESTED(int flag);


@@ -2224,6 +2224,61 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
  }
}
/* --------------------------------------------------------------------------*/
/*!
@ingroup THREADPRIVATE
@param loc source location information
@param gtid global thread number
@param cpy_data pointer to the data to be saved/copied, or 0
@return the saved pointer to the data

__kmpc_copyprivate_light is a lighter-weight version of __kmpc_copyprivate:
it only saves the pointer it is given (if non-zero, i.e. coming from the
thread that executed the single region) and returns that pointer from every
call, so the executing thread itself does not need the result. It does not
copy any data; the copying has to be done elsewhere, e.g. inline in the
generated code. For the same reason, and unlike __kmpc_copyprivate, this
function has no barrier at its end, so the generated code needs a barrier
after all data copying is done.
*/
void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));

  KMP_MB();

  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following barrier

  if (cpy_data)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
  /* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return *data_ptr;
}
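For context, a hedged sketch (not from this patch; x, loc, and gtid are placeholders) of how generated code could use this entry point for "#pragma omp single copyprivate(x)". Both the actual data copy and the final barrier are the generated code's responsibility:

// Hypothetical lowering sketch for:
//   #pragma omp single copyprivate(x)
//     x = 42;
int x;                                    // each thread's private copy
kmp_int32 did_it = __kmpc_single(loc, gtid);
if (did_it) {
  x = 42;                                 // single region body, winner only
  __kmpc_end_single(loc, gtid);
}
// Only the winner passes a non-zero pointer; every thread gets it back.
void *src = __kmpc_copyprivate_light(loc, gtid, did_it ? &x : NULL);
if (!did_it)
  x = *(int *)src;                        // copy done inline, not by the runtime
// __kmpc_copyprivate_light has no trailing barrier, so the generated code
// must add one after all copies have completed.
__kmpc_barrier(loc, gtid);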
/* -------------------------------------------------------------------------- */
#define INIT_LOCK __kmp_init_user_lock_with_checks
@@ -4348,7 +4403,7 @@ void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
                  omp_allocator_handle_t free_allocator) {
  return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
                       free_allocator);
}
void omp_free(void *ptr, omp_allocator_handle_t allocator) {


@@ -2285,6 +2285,219 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
  return status;
}
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return Zero if the parallel region is not active and this thread should
execute all sections; non-zero otherwise.

Beginning of the sections construct.
There are no implicit barriers in the "sections" calls; instead, the compiler
should introduce an explicit barrier if one is required.

This implementation is based on __kmp_dispatch_init and uses the same
constructs for the shared data (sections cannot be nested directly inside an
omp for loop; there has to be a parallel region in between).
*/
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));

  if (active) {
    // Setup sections in the same way as dynamic scheduled loops.
    // We need one shared data: which section is to execute next.
    // (in case parallel is not active, all sections will be executed on the
    // same thread)
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    // reuse shared data structures from dynamic sched loops:
    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));

    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current =
        nullptr; // sections construct doesn't need private data
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return active;
}
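The buffer handling above mirrors __kmp_dispatch_init: each instance of a dynamically scheduled construct claims a slot in a small ring of team-wide shared buffers and spins until the previous user of that slot has released it. A simplified stand-alone model of that scheme (illustration only; NUM_BUFFERS stands in for __kmp_dispatch_num_buffers, and this is not the runtime's actual code):

#include <atomic>

constexpr unsigned NUM_BUFFERS = 7;       // stands in for __kmp_dispatch_num_buffers

struct shared_buffer {
  std::atomic<unsigned> buffer_index{0};  // instance currently allowed to use this slot
  // ... the shared iteration / num_done counters would live here too ...
};

shared_buffer ring[NUM_BUFFERS];          // one ring per team, like team->t.t_disp_buffer

shared_buffer *acquire(unsigned my_instance) {
  shared_buffer *slot = &ring[my_instance % NUM_BUFFERS];
  while (slot->buffer_index.load() != my_instance) {
    // spin, as __kmp_wait does on sh->buffer_index
  }
  return slot;
}
// The last thread through the construct releases the slot with
// buffer_index += NUM_BUFFERS, which is what __kmpc_next_section does below.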
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@param numberOfSections number of sections in the 'sections' construct
@return a value in [0, numberOfSections) - the id of the section this thread
should execute next; numberOfSections (or any other value out of range) -
nothing left for this thread to execute
*/
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
                              kmp_int32 numberOfSections) {

  KMP_TIME_PARTITIONED_BLOCK(OMP_sections);

  kmp_info_t *th = __kmp_threads[gtid];
#ifdef KMP_DEBUG
  kmp_team_t *team = th->th.th_team;
#endif

  KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
                  numberOfSections));

  // For serialized case we should not call this function:
  KMP_DEBUG_ASSERT(!team->t.t_serialized);

  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

  KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
  sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
      th->th.th_dispatch->th_dispatch_sh_current);
  KMP_DEBUG_ASSERT(sh);

  kmp_int32 sectionIndex = 0;
  bool moreSectionsToExecute = true;

  // Find section to execute:
  sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
  if (sectionIndex >= numberOfSections) {
    moreSectionsToExecute = false;
  }

  // status == 0: no more sections to execute;
  // OMPTODO: __kmpc_end_sections could be bypassed?
  if (!moreSectionsToExecute) {
    kmp_int32 num_done;
    num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));

    if (num_done == th->th.th_team_nproc - 1) {
      /* NOTE: release this buffer to be reused */

      KMP_MB(); /* Flush all pending memory write invalidates. */

      sh->u.s.num_done = 0;
      sh->u.s.iteration = 0;

      KMP_MB(); /* Flush all pending memory write invalidates. */

      sh->buffer_index += __kmp_dispatch_num_buffers;
      KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
                     sh->buffer_index));

      KMP_MB(); /* Flush all pending memory write invalidates. */
    } // if

    th->th.th_dispatch->th_deo_fcn = NULL;
    th->th.th_dispatch->th_dxo_fcn = NULL;
    th->th.th_dispatch->th_dispatch_sh_current = NULL;
    th->th.th_dispatch->th_dispatch_pr_current = NULL;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_dispatch) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_data_t instance = ompt_data_none;
      instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
          &(team_info->parallel_data), &(task_info->task_data),
          ompt_dispatch_section, instance);
    }
#endif
    KMP_POP_PARTITIONED_TIMER();
  }

  return sectionIndex;
}
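Put differently, the handout performed by __kmpc_next_section is equivalent to a dynamically scheduled loop with chunk size 1 over the section ids. A minimal stand-alone analogy using plain C++ atomics in place of test_then_inc and the shared dispatch buffer (illustration only):

#include <atomic>

// next_counter plays the role of sh->u.s.iteration.
std::atomic<int> next_counter{0};

// Returns the id of the next unclaimed section, or numberOfSections when
// every section has already been handed out (nothing left for this thread).
int claim_section(int numberOfSections) {
  int idx = next_counter.fetch_add(1);   // analogous to test_then_inc
  return idx < numberOfSections ? idx : numberOfSections;
}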
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number

End of the "sections" construct.
There is no need to wait here: a barrier is added separately when needed.
*/
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {

  kmp_info_t *th = __kmp_threads[gtid];
  int active = !th->th.th_team->t.t_serialized;

  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));

  if (!active) {
    // In active case call finalization is done in __kmpc_next_section
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_work) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_callbacks.ompt_callback(ompt_callback_work)(
          ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
          &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
    }
#endif
    KMP_POP_PARTITIONED_TIMER();
  }

  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
}
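Since neither __kmpc_next_section nor __kmpc_end_sections blocks, the epilogue a compiler emits for the construct might look like the following sketch (has_nowait_clause is a hypothetical compile-time condition, not a real flag):

__kmpc_end_sections(loc, gtid);  // finalization only; never waits
if (!has_nowait_clause)          // hypothetical compile-time decision
  __kmpc_barrier(loc, gtid);     // explicit barrier unless 'nowait' was given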
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,