mirror of
https://github.com/intel/llvm.git
synced 2026-02-05 13:21:04 +08:00
OpenMP 4.5: implemented support of schedule(simd:guided) and
schedule(simd:runtime) - library part. Compiler generation should use newly introduced scheduling kinds kmp_sch_guided_simd = 46, kmp_sch_runtime_simd = 47, as parameters to __kmpc_dispatch_init_* entries. Differential Revision: https://reviews.llvm.org/D31602 llvm-svn: 304724
This commit is contained in:
@@ -334,10 +334,12 @@ enum sched_type {
|
||||
#if OMP_45_ENABLED
|
||||
/* static with chunk adjustment (e.g., simd) */
|
||||
kmp_sch_static_balanced_chunked = 45,
|
||||
kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */
|
||||
kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */
|
||||
#endif
|
||||
|
||||
/* accessible only through KMP_SCHEDULE environment variable */
|
||||
kmp_sch_upper = 46, /**< upper bound for unordered values */
|
||||
kmp_sch_upper = 48, /**< upper bound for unordered values */
|
||||
|
||||
kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */
|
||||
kmp_ord_static_chunked = 65,
|
||||
|
||||
@@ -681,6 +681,35 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
|
||||
schedule = kmp_sch_guided_iterative_chunked;
|
||||
KMP_WARNING(DispatchManyThreads);
|
||||
}
|
||||
if (schedule == kmp_sch_runtime_simd) {
|
||||
// compiler provides simd_width in the chunk parameter
|
||||
schedule = team->t.t_sched.r_sched_type;
|
||||
// Detail the schedule if needed (global controls are differentiated
|
||||
// appropriately)
|
||||
if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
|
||||
schedule == __kmp_static) {
|
||||
schedule = kmp_sch_static_balanced_chunked;
|
||||
} else {
|
||||
if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
|
||||
schedule = kmp_sch_guided_simd;
|
||||
}
|
||||
chunk = team->t.t_sched.chunk * chunk;
|
||||
}
|
||||
#if USE_ITT_BUILD
|
||||
cur_chunk = chunk;
|
||||
#endif
|
||||
#ifdef KMP_DEBUG
|
||||
{
|
||||
const char *buff;
|
||||
// create format specifiers before the debug output
|
||||
buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
|
||||
" chunk:%%%s\n",
|
||||
traits_t<ST>::spec);
|
||||
KD_TRACE(10, (buff, gtid, schedule, chunk));
|
||||
__kmp_str_free(&buff);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
pr->u.p.parm1 = chunk;
|
||||
}
|
||||
KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
|
||||
@@ -878,7 +907,21 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
|
||||
}
|
||||
break;
|
||||
} // case
|
||||
case kmp_sch_guided_iterative_chunked: {
|
||||
case kmp_sch_static_balanced_chunked: {
|
||||
// similar to balanced, but chunk adjusted to multiple of simd width
|
||||
T nth = th->th.th_team_nproc;
|
||||
KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
|
||||
" -> falling-through to static_greedy\n",
|
||||
gtid));
|
||||
schedule = kmp_sch_static_greedy;
|
||||
if (nth > 1)
|
||||
pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
|
||||
else
|
||||
pr->u.p.parm1 = tc;
|
||||
break;
|
||||
} // case
|
||||
case kmp_sch_guided_iterative_chunked:
|
||||
case kmp_sch_guided_simd: {
|
||||
T nproc = th->th.th_team_nproc;
|
||||
KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
|
||||
" case\n",
|
||||
@@ -1140,6 +1183,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
|
||||
break;
|
||||
case kmp_sch_guided_iterative_chunked:
|
||||
case kmp_sch_guided_analytical_chunked:
|
||||
case kmp_sch_guided_simd:
|
||||
schedtype = 2;
|
||||
break;
|
||||
default:
|
||||
@@ -1991,6 +2035,89 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
|
||||
} // case
|
||||
break;
|
||||
|
||||
case kmp_sch_guided_simd: {
|
||||
// same as iterative but curr-chunk adjusted to be multiple of given
|
||||
// chunk
|
||||
T chunk = pr->u.p.parm1;
|
||||
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
|
||||
gtid));
|
||||
trip = pr->u.p.tc;
|
||||
// Start atomic part of calculations
|
||||
while (1) {
|
||||
ST remaining; // signed, because can be < 0
|
||||
init = sh->u.s.iteration; // shared value
|
||||
remaining = trip - init;
|
||||
if (remaining <= 0) { // AC: need to compare with 0 first
|
||||
status = 0; // nothing to do, don't try atomic op
|
||||
break;
|
||||
}
|
||||
KMP_DEBUG_ASSERT(init % chunk == 0);
|
||||
// compare with K*nproc*(chunk+1), K=2 by default
|
||||
if ((T)remaining < pr->u.p.parm2) {
|
||||
// use dynamic-style shcedule
|
||||
// atomically inrement iterations, get old value
|
||||
init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunk);
|
||||
remaining = trip - init;
|
||||
if (remaining <= 0) {
|
||||
status = 0; // all iterations got by other threads
|
||||
} else {
|
||||
// got some iterations to work on
|
||||
status = 1;
|
||||
if ((T)remaining > chunk) {
|
||||
limit = init + chunk - 1;
|
||||
} else {
|
||||
last = 1; // the last chunk
|
||||
limit = init + remaining - 1;
|
||||
} // if
|
||||
} // if
|
||||
break;
|
||||
} // if
|
||||
// divide by K*nproc
|
||||
UT span = remaining * (*(double *)&pr->u.p.parm3);
|
||||
UT rem = span % chunk;
|
||||
if (rem) // adjust so that span%chunk == 0
|
||||
span += chunk - rem;
|
||||
limit = init + span;
|
||||
if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
|
||||
(ST)limit)) {
|
||||
// CAS was successful, chunk obtained
|
||||
status = 1;
|
||||
--limit;
|
||||
break;
|
||||
} // if
|
||||
} // while
|
||||
if (status != 0) {
|
||||
start = pr->u.p.lb;
|
||||
incr = pr->u.p.st;
|
||||
if (p_st != NULL)
|
||||
*p_st = incr;
|
||||
*p_lb = start + init * incr;
|
||||
*p_ub = start + limit * incr;
|
||||
if (pr->ordered) {
|
||||
pr->u.p.ordered_lower = init;
|
||||
pr->u.p.ordered_upper = limit;
|
||||
#ifdef KMP_DEBUG
|
||||
{
|
||||
const char *buff;
|
||||
// create format specifiers before the debug output
|
||||
buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
|
||||
"ordered_lower:%%%s ordered_upper:%%%s\n",
|
||||
traits_t<UT>::spec, traits_t<UT>::spec);
|
||||
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
|
||||
pr->u.p.ordered_upper));
|
||||
__kmp_str_free(&buff);
|
||||
}
|
||||
#endif
|
||||
} // if
|
||||
} else {
|
||||
*p_lb = 0;
|
||||
*p_ub = 0;
|
||||
if (p_st != NULL)
|
||||
*p_st = 0;
|
||||
} // if
|
||||
} // case
|
||||
break;
|
||||
|
||||
case kmp_sch_guided_analytical_chunked: {
|
||||
T chunkspec = pr->u.p.parm1;
|
||||
UT chunkIdx;
|
||||
|
||||
@@ -2744,7 +2744,7 @@ void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
|
||||
__kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
|
||||
kmp_sched_lower - 2];
|
||||
}
|
||||
if (kind == kmp_sched_auto) {
|
||||
if (kind == kmp_sched_auto || chunk < 1) {
|
||||
// ignore parameter chunk for schedule auto
|
||||
thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
|
||||
} else {
|
||||
|
||||
410
openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c
Normal file
410
openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c
Normal file
@@ -0,0 +1,410 @@
|
||||
// RUN: %libomp-compile-and-run
|
||||
/*
|
||||
Test for the 'schedule(simd:guided)' clause.
|
||||
Compiler needs to generate a dynamic dispatching and pass the schedule
|
||||
value 46 to the OpenMP RTL. Test uses numerous loop parameter combinations.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#define delay() Sleep(1);
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#define delay() usleep(10);
|
||||
#endif
|
||||
|
||||
// uncomment for debug diagnostics:
|
||||
//#define DEBUG
|
||||
|
||||
#define SIMD_LEN 4
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Various definitions copied from OpenMP RTL
|
||||
enum sched {
|
||||
kmp_sch_static_balanced_chunked = 45,
|
||||
kmp_sch_guided_simd = 46,
|
||||
kmp_sch_runtime_simd = 47,
|
||||
};
|
||||
typedef unsigned u32;
|
||||
typedef long long i64;
|
||||
typedef unsigned long long u64;
|
||||
typedef struct {
|
||||
int reserved_1;
|
||||
int flags;
|
||||
int reserved_2;
|
||||
int reserved_3;
|
||||
char *psource;
|
||||
} id;
|
||||
|
||||
extern int __kmpc_global_thread_num(id*);
|
||||
extern void __kmpc_barrier(id*, int gtid);
|
||||
extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
|
||||
extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
|
||||
extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
|
||||
extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
|
||||
// End of definitions copied from OpenMP RTL.
|
||||
// ---------------------------------------------------------------------------
|
||||
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) {
|
||||
int err = 0;
|
||||
static int volatile loop_sync = 0;
|
||||
i64 lb; // Chunk lower bound
|
||||
i64 ub; // Chunk upper bound
|
||||
i64 st; // Chunk stride
|
||||
int rc;
|
||||
int tid = omp_get_thread_num();
|
||||
int gtid = tid;
|
||||
int last;
|
||||
#if DEBUG
|
||||
printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
|
||||
(int)sizeof(i64), gtid, tid,
|
||||
(int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
|
||||
#endif
|
||||
// Don't test degenerate cases that should have been discovered by codegen
|
||||
if (loop_st == 0)
|
||||
return 0;
|
||||
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
|
||||
return 0;
|
||||
|
||||
__kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd,
|
||||
loop_lb, loop_ub, loop_st, loop_chunk);
|
||||
if (tid == 0) {
|
||||
// Let the master thread handle the chunks alone
|
||||
int chunk; // No of current chunk
|
||||
i64 next_lb; // Lower bound of the next chunk
|
||||
i64 last_ub; // Upper bound of the last processed chunk
|
||||
u64 cur; // Number of interations in current chunk
|
||||
u64 max; // Max allowed iterations for current chunk
|
||||
int undersized = 0;
|
||||
|
||||
chunk = 0;
|
||||
next_lb = loop_lb;
|
||||
max = (loop_ub - loop_lb) / loop_st + 1;
|
||||
// The first chunk can consume all iterations
|
||||
while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
|
||||
++ chunk;
|
||||
#if DEBUG
|
||||
printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
|
||||
#endif
|
||||
// Check if previous chunk (it is not the final chunk) is undersized
|
||||
if (undersized) {
|
||||
printf("Error with chunk %d\n", chunk);
|
||||
err++;
|
||||
}
|
||||
// Check lower and upper bounds
|
||||
if (lb != next_lb) {
|
||||
printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
|
||||
err++;
|
||||
}
|
||||
if (loop_st > 0) {
|
||||
if (!(ub <= loop_ub)) {
|
||||
printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(lb <= ub)) {
|
||||
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
|
||||
err++;
|
||||
}
|
||||
} else {
|
||||
if (!(ub >= loop_ub)) {
|
||||
printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(lb >= ub)) {
|
||||
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
|
||||
err++;
|
||||
}
|
||||
}; // if
|
||||
// Stride should not change
|
||||
if (!(st == loop_st)) {
|
||||
printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
|
||||
err++;
|
||||
}
|
||||
cur = (ub - lb) / loop_st + 1;
|
||||
// Guided scheduling uses FP computations, so current chunk may
|
||||
// be a bit bigger (+1) than allowed maximum
|
||||
if (!(cur <= max + 1)) {
|
||||
printf("Error with iter %d, %d\n", cur, max);
|
||||
err++;
|
||||
}
|
||||
// Update maximum for the next chunk
|
||||
if (cur < max)
|
||||
max = cur;
|
||||
next_lb = ub + loop_st;
|
||||
last_ub = ub;
|
||||
undersized = (cur < loop_chunk);
|
||||
}; // while
|
||||
// Must have at least one chunk
|
||||
if (!(chunk > 0)) {
|
||||
printf("Error with chunk %d\n", chunk);
|
||||
err++;
|
||||
}
|
||||
// Must have the right last iteration index
|
||||
if (loop_st > 0) {
|
||||
if (!(last_ub <= loop_ub)) {
|
||||
printf("Error with last1 %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(last_ub + loop_st > loop_ub)) {
|
||||
printf("Error with last2 %d, %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
} else {
|
||||
if (!(last_ub >= loop_ub)) {
|
||||
printf("Error with last1 %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(last_ub + loop_st < loop_ub)) {
|
||||
printf("Error with last2 %d, %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
}; // if
|
||||
// Let non-master threads go
|
||||
loop_sync = 1;
|
||||
} else {
|
||||
int i;
|
||||
// Workers wait for master thread to finish, then call __kmpc_dispatch_next
|
||||
for (i = 0; i < 1000000; ++ i) {
|
||||
if (loop_sync != 0) {
|
||||
break;
|
||||
}; // if
|
||||
}; // for i
|
||||
while (loop_sync == 0) {
|
||||
delay();
|
||||
}; // while
|
||||
// At this moment we do not have any more chunks -- all the chunks already
|
||||
// processed by master thread
|
||||
rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st);
|
||||
if (rc) {
|
||||
printf("Error return value\n");
|
||||
err++;
|
||||
}
|
||||
}; // if
|
||||
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
if (tid == 0) {
|
||||
loop_sync = 0; // Restore original state
|
||||
#if DEBUG
|
||||
printf("run_loop_64(): at the end\n");
|
||||
#endif
|
||||
}; // if
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
return err;
|
||||
} // run_loop
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) {
|
||||
int err = 0;
|
||||
static int volatile loop_sync = 0;
|
||||
int lb; // Chunk lower bound
|
||||
int ub; // Chunk upper bound
|
||||
int st; // Chunk stride
|
||||
int rc;
|
||||
int tid = omp_get_thread_num();
|
||||
int gtid = tid;
|
||||
int last;
|
||||
#if DEBUG
|
||||
printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
|
||||
(int)sizeof(int), gtid, tid,
|
||||
(int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
|
||||
#endif
|
||||
// Don't test degenerate cases that should have been discovered by codegen
|
||||
if (loop_st == 0)
|
||||
return 0;
|
||||
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
|
||||
return 0;
|
||||
|
||||
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd,
|
||||
loop_lb, loop_ub, loop_st, loop_chunk);
|
||||
if (tid == 0) {
|
||||
// Let the master thread handle the chunks alone
|
||||
int chunk; // No of current chunk
|
||||
int next_lb; // Lower bound of the next chunk
|
||||
int last_ub; // Upper bound of the last processed chunk
|
||||
u64 cur; // Number of interations in current chunk
|
||||
u64 max; // Max allowed iterations for current chunk
|
||||
int undersized = 0;
|
||||
|
||||
chunk = 0;
|
||||
next_lb = loop_lb;
|
||||
max = (loop_ub - loop_lb) / loop_st + 1;
|
||||
// The first chunk can consume all iterations
|
||||
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
|
||||
++ chunk;
|
||||
#if DEBUG
|
||||
printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
|
||||
#endif
|
||||
// Check if previous chunk (it is not the final chunk) is undersized
|
||||
if (undersized) {
|
||||
printf("Error with chunk %d\n", chunk);
|
||||
err++;
|
||||
}
|
||||
// Check lower and upper bounds
|
||||
if (lb != next_lb) {
|
||||
printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
|
||||
err++;
|
||||
}
|
||||
if (loop_st > 0) {
|
||||
if (!(ub <= loop_ub)) {
|
||||
printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(lb <= ub)) {
|
||||
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
|
||||
err++;
|
||||
}
|
||||
} else {
|
||||
if (!(ub >= loop_ub)) {
|
||||
printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(lb >= ub)) {
|
||||
printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
|
||||
err++;
|
||||
}
|
||||
}; // if
|
||||
// Stride should not change
|
||||
if (!(st == loop_st)) {
|
||||
printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
|
||||
err++;
|
||||
}
|
||||
cur = (ub - lb) / loop_st + 1;
|
||||
// Guided scheduling uses FP computations, so current chunk may
|
||||
// be a bit bigger (+1) than allowed maximum
|
||||
if (!(cur <= max + 1)) {
|
||||
printf("Error with iter %d, %d\n", cur, max);
|
||||
err++;
|
||||
}
|
||||
// Update maximum for the next chunk
|
||||
if (cur < max)
|
||||
max = cur;
|
||||
next_lb = ub + loop_st;
|
||||
last_ub = ub;
|
||||
undersized = (cur < loop_chunk);
|
||||
}; // while
|
||||
// Must have at least one chunk
|
||||
if (!(chunk > 0)) {
|
||||
printf("Error with chunk %d\n", chunk);
|
||||
err++;
|
||||
}
|
||||
// Must have the right last iteration index
|
||||
if (loop_st > 0) {
|
||||
if (!(last_ub <= loop_ub)) {
|
||||
printf("Error with last1 %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(last_ub + loop_st > loop_ub)) {
|
||||
printf("Error with last2 %d, %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
} else {
|
||||
if (!(last_ub >= loop_ub)) {
|
||||
printf("Error with last1 %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
if (!(last_ub + loop_st < loop_ub)) {
|
||||
printf("Error with last2 %d, %d, %d, ch %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk);
|
||||
err++;
|
||||
}
|
||||
}; // if
|
||||
// Let non-master threads go
|
||||
loop_sync = 1;
|
||||
} else {
|
||||
int i;
|
||||
// Workers wait for master thread to finish, then call __kmpc_dispatch_next
|
||||
for (i = 0; i < 1000000; ++ i) {
|
||||
if (loop_sync != 0) {
|
||||
break;
|
||||
}; // if
|
||||
}; // for i
|
||||
while (loop_sync == 0) {
|
||||
delay();
|
||||
}; // while
|
||||
// At this moment we do not have any more chunks -- all the chunks already
|
||||
// processed by the master thread
|
||||
rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st);
|
||||
if (rc) {
|
||||
printf("Error return value\n");
|
||||
err++;
|
||||
}
|
||||
}; // if
|
||||
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
if (tid == 0) {
|
||||
loop_sync = 0; // Restore original state
|
||||
#if DEBUG
|
||||
printf("run_loop<>(): at the end\n");
|
||||
#endif
|
||||
}; // if
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
return err;
|
||||
} // run_loop
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
int run_64(int num_th)
|
||||
{
|
||||
int err = 0;
|
||||
#pragma omp parallel num_threads(num_th)
|
||||
{
|
||||
int chunk;
|
||||
i64 st, lb, ub;
|
||||
for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
|
||||
for (st = 1; st <= 3; ++ st) {
|
||||
for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
|
||||
for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
|
||||
err += run_loop_64(lb, ub, st, chunk);
|
||||
err += run_loop_64(ub, lb, -st, chunk);
|
||||
}; // for ub
|
||||
}; // for lb
|
||||
}; // for st
|
||||
}; // for chunk
|
||||
}
|
||||
return err;
|
||||
} // run_all
|
||||
|
||||
int run_32(int num_th)
|
||||
{
|
||||
int err = 0;
|
||||
#pragma omp parallel num_threads(num_th)
|
||||
{
|
||||
int chunk, st, lb, ub;
|
||||
for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
|
||||
for (st = 1; st <= 3; ++ st) {
|
||||
for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
|
||||
for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
|
||||
err += run_loop_32(lb, ub, st, chunk);
|
||||
err += run_loop_32(ub, lb, -st, chunk);
|
||||
}; // for ub
|
||||
}; // for lb
|
||||
}; // for st
|
||||
}; // for chunk
|
||||
}
|
||||
return err;
|
||||
} // run_all
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
int main()
|
||||
{
|
||||
int n, err = 0;
|
||||
for (n = 1; n <= 4; ++ n) {
|
||||
err += run_32(n);
|
||||
err += run_64(n);
|
||||
}; // for n
|
||||
if (err)
|
||||
printf("failed with %d errors\n", err);
|
||||
else
|
||||
printf("passed\n");
|
||||
return err;
|
||||
}
|
||||
221
openmp/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
Normal file
221
openmp/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
Normal file
@@ -0,0 +1,221 @@
|
||||
// RUN: %libomp-compile-and-run
|
||||
|
||||
// The test checks schedule(simd:runtime)
|
||||
// in combination with omp_set_schedule()
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <omp.h>
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#define delay() Sleep(1);
|
||||
#define seten(a,b,c) _putenv_s((a),(b))
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#define delay() usleep(10);
|
||||
#define seten(a,b,c) setenv((a),(b),(c))
|
||||
#endif
|
||||
|
||||
#define SIMD_LEN 4
|
||||
int err = 0;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Various definitions copied from OpenMP RTL.
|
||||
enum sched {
|
||||
kmp_sch_static_balanced_chunked = 45,
|
||||
kmp_sch_guided_simd = 46,
|
||||
kmp_sch_runtime_simd = 47,
|
||||
};
|
||||
typedef unsigned u32;
|
||||
typedef long long i64;
|
||||
typedef unsigned long long u64;
|
||||
typedef struct {
|
||||
int reserved_1;
|
||||
int flags;
|
||||
int reserved_2;
|
||||
int reserved_3;
|
||||
char *psource;
|
||||
} id;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int __kmpc_global_thread_num(id*);
|
||||
void __kmpc_barrier(id*, int gtid);
|
||||
void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
|
||||
void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
|
||||
int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
|
||||
int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
// End of definitions copied from OpenMP RTL.
|
||||
// ---------------------------------------------------------------------------
|
||||
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
void
|
||||
run_loop(
|
||||
int loop_lb, // Loop lower bound.
|
||||
int loop_ub, // Loop upper bound.
|
||||
int loop_st, // Loop stride.
|
||||
int lchunk
|
||||
) {
|
||||
static int volatile loop_sync = 0;
|
||||
int lb; // Chunk lower bound.
|
||||
int ub; // Chunk upper bound.
|
||||
int st; // Chunk stride.
|
||||
int rc;
|
||||
int tid = omp_get_thread_num();
|
||||
int gtid = __kmpc_global_thread_num(&loc);
|
||||
int last;
|
||||
int tc = (loop_ub - loop_lb) / loop_st + 1;
|
||||
int ch;
|
||||
int no_chunk = 0;
|
||||
if (lchunk == 0) {
|
||||
no_chunk = 1;
|
||||
lchunk = 1;
|
||||
}
|
||||
ch = lchunk * SIMD_LEN;
|
||||
#if _DEBUG > 1
|
||||
printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
|
||||
gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
|
||||
#endif
|
||||
// Don't test degenerate cases that should have been discovered by codegen.
|
||||
if (loop_st == 0)
|
||||
return;
|
||||
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
|
||||
return;
|
||||
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
|
||||
loop_lb, loop_ub, loop_st, SIMD_LEN);
|
||||
{
|
||||
// Let the master thread handle the chunks alone.
|
||||
int chunk; // No of current chunk.
|
||||
int last_ub; // Upper bound of the last processed chunk.
|
||||
u64 cur; // Number of interations in current chunk.
|
||||
u64 max; // Max allowed iterations for current chunk.
|
||||
int undersized = 0;
|
||||
last_ub = loop_ub;
|
||||
chunk = 0;
|
||||
max = (loop_ub - loop_lb) / loop_st + 1;
|
||||
// The first chunk can consume all iterations.
|
||||
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
|
||||
++ chunk;
|
||||
#if _DEBUG
|
||||
printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
|
||||
tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
|
||||
#endif
|
||||
// Check if previous chunk (it is not the final chunk) is undersized.
|
||||
if (undersized)
|
||||
printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
|
||||
if (loop_st > 0) {
|
||||
if (!(ub <= loop_ub))
|
||||
printf("Error with ub %d, %d, ch %d, err %d\n",
|
||||
(int)ub, (int)loop_ub, chunk, ++err);
|
||||
if (!(lb <= ub))
|
||||
printf("Error with bounds %d, %d, %d, err %d\n",
|
||||
(int)lb, (int)ub, chunk, ++err);
|
||||
} else {
|
||||
if (!(ub >= loop_ub))
|
||||
printf("Error with ub %d, %d, %d, err %d\n",
|
||||
(int)ub, (int)loop_ub, chunk, ++err);
|
||||
if (!(lb >= ub))
|
||||
printf("Error with bounds %d, %d, %d, err %d\n",
|
||||
(int)lb, (int)ub, chunk, ++err);
|
||||
}; // if
|
||||
// Stride should not change.
|
||||
if (!(st == loop_st))
|
||||
printf("Error with st %d, %d, ch %d, err %d\n",
|
||||
(int)st, (int)loop_st, chunk, ++err);
|
||||
cur = ( ub - lb ) / loop_st + 1;
|
||||
// Guided scheduling uses FP computations, so current chunk may
|
||||
// be a bit bigger (+1) than allowed maximum.
|
||||
if (!( cur <= max + 1))
|
||||
printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
|
||||
// Update maximum for the next chunk.
|
||||
if (last) {
|
||||
if (!no_chunk && cur > ch)
|
||||
printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
|
||||
(int)cur, ch, tid, ++err);
|
||||
} else {
|
||||
if (cur % ch)
|
||||
printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
|
||||
chunk, (int)cur, ch, tid, ++err);
|
||||
}
|
||||
if (cur < max)
|
||||
max = cur;
|
||||
last_ub = ub;
|
||||
undersized = (cur < ch);
|
||||
#if _DEBUG > 1
|
||||
if (last)
|
||||
printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
|
||||
undersized,cur,ch,tid,ub,lb,loop_st);
|
||||
#endif
|
||||
} // while
|
||||
// Must have the right last iteration index.
|
||||
if (loop_st > 0) {
|
||||
if (!(last_ub <= loop_ub))
|
||||
printf("Error with last1 %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk, ++err);
|
||||
if (last && !(last_ub + loop_st > loop_ub))
|
||||
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
|
||||
} else {
|
||||
if (!(last_ub >= loop_ub))
|
||||
printf("Error with last1 %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk, ++err);
|
||||
if (last && !(last_ub + loop_st < loop_ub))
|
||||
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
|
||||
} // if
|
||||
}
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
} // run_loop
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int chunk = 0;
|
||||
// static (no chunk)
|
||||
omp_set_schedule(omp_sched_static,0);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
// auto (chunk should be ignorted)
|
||||
omp_set_schedule(omp_sched_auto,0);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
// static,1
|
||||
chunk = 1;
|
||||
omp_set_schedule(omp_sched_static,1);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
// dynamic,1
|
||||
omp_set_schedule(omp_sched_dynamic,1);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
// guided,1
|
||||
omp_set_schedule(omp_sched_guided,1);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
// dynamic,0 - use default chunk size 1
|
||||
omp_set_schedule(omp_sched_dynamic,0);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
// guided,0 - use default chunk size 1
|
||||
omp_set_schedule(omp_sched_guided,0);
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
|
||||
if (err) {
|
||||
printf("failed, err = %d\n", err);
|
||||
return 1;
|
||||
} else {
|
||||
printf("passed\n");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
// RUN: %libomp-compile
|
||||
// RUN: env OMP_SCHEDULE=guided %libomp-run
|
||||
// RUN: env OMP_SCHEDULE=guided,1 %libomp-run 1
|
||||
// RUN: env OMP_SCHEDULE=guided,2 %libomp-run 2
|
||||
// RUN: env OMP_SCHEDULE=dynamic %libomp-run
|
||||
// RUN: env OMP_SCHEDULE=dynamic,1 %libomp-run 1
|
||||
// RUN: env OMP_SCHEDULE=dynamic,2 %libomp-run 2
|
||||
// RUN: env OMP_SCHEDULE=auto %libomp-run
|
||||
|
||||
// The test checks schedule(simd:runtime)
|
||||
// in combination with OMP_SCHEDULE=guided[,chunk]
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <omp.h>
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#define delay() Sleep(1);
|
||||
#define seten(a,b,c) _putenv_s((a),(b))
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#define delay() usleep(10);
|
||||
#define seten(a,b,c) setenv((a),(b),(c))
|
||||
#endif
|
||||
|
||||
#define UBOUND 100
|
||||
#define SIMD_LEN 4
|
||||
int err = 0;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Various definitions copied from OpenMP RTL.
|
||||
enum sched {
|
||||
kmp_sch_static_balanced_chunked = 45,
|
||||
kmp_sch_guided_simd = 46,
|
||||
kmp_sch_runtime_simd = 47,
|
||||
};
|
||||
typedef unsigned u32;
|
||||
typedef long long i64;
|
||||
typedef unsigned long long u64;
|
||||
typedef struct {
|
||||
int reserved_1;
|
||||
int flags;
|
||||
int reserved_2;
|
||||
int reserved_3;
|
||||
char *psource;
|
||||
} id;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int __kmpc_global_thread_num(id*);
|
||||
void __kmpc_barrier(id*, int gtid);
|
||||
void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
|
||||
void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
|
||||
int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
|
||||
int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
// End of definitions copied from OpenMP RTL.
|
||||
// ---------------------------------------------------------------------------
|
||||
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
void
|
||||
run_loop(
|
||||
int loop_lb, // Loop lower bound.
|
||||
int loop_ub, // Loop upper bound.
|
||||
int loop_st, // Loop stride.
|
||||
int lchunk
|
||||
) {
|
||||
static int volatile loop_sync = 0;
|
||||
int lb; // Chunk lower bound.
|
||||
int ub; // Chunk upper bound.
|
||||
int st; // Chunk stride.
|
||||
int rc;
|
||||
int tid = omp_get_thread_num();
|
||||
int gtid = __kmpc_global_thread_num(&loc);
|
||||
int last;
|
||||
int tc = (loop_ub - loop_lb) / loop_st + 1;
|
||||
int ch;
|
||||
int no_chunk = 0;
|
||||
if (lchunk == 0) {
|
||||
no_chunk = 1;
|
||||
lchunk = 1;
|
||||
}
|
||||
ch = lchunk * SIMD_LEN;
|
||||
#if _DEBUG > 1
|
||||
printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
|
||||
gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
|
||||
#endif
|
||||
// Don't test degenerate cases that should have been discovered by codegen.
|
||||
if (loop_st == 0)
|
||||
return;
|
||||
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
|
||||
return;
|
||||
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
|
||||
loop_lb, loop_ub, loop_st, SIMD_LEN);
|
||||
{
|
||||
// Let the master thread handle the chunks alone.
|
||||
int chunk; // No of current chunk.
|
||||
int last_ub; // Upper bound of the last processed chunk.
|
||||
u64 cur; // Number of interations in current chunk.
|
||||
u64 max; // Max allowed iterations for current chunk.
|
||||
int undersized = 0;
|
||||
last_ub = loop_ub;
|
||||
chunk = 0;
|
||||
max = (loop_ub - loop_lb) / loop_st + 1;
|
||||
// The first chunk can consume all iterations.
|
||||
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
|
||||
++ chunk;
|
||||
#if _DEBUG
|
||||
printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
|
||||
tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
|
||||
#endif
|
||||
// Check if previous chunk (it is not the final chunk) is undersized.
|
||||
if (undersized)
|
||||
printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
|
||||
if (loop_st > 0) {
|
||||
if (!(ub <= loop_ub))
|
||||
printf("Error with ub %d, %d, ch %d, err %d\n",
|
||||
(int)ub, (int)loop_ub, chunk, ++err);
|
||||
if (!(lb <= ub))
|
||||
printf("Error with bounds %d, %d, %d, err %d\n",
|
||||
(int)lb, (int)ub, chunk, ++err);
|
||||
} else {
|
||||
if (!(ub >= loop_ub))
|
||||
printf("Error with ub %d, %d, %d, err %d\n",
|
||||
(int)ub, (int)loop_ub, chunk, ++err);
|
||||
if (!(lb >= ub))
|
||||
printf("Error with bounds %d, %d, %d, err %d\n",
|
||||
(int)lb, (int)ub, chunk, ++err);
|
||||
}; // if
|
||||
// Stride should not change.
|
||||
if (!(st == loop_st))
|
||||
printf("Error with st %d, %d, ch %d, err %d\n",
|
||||
(int)st, (int)loop_st, chunk, ++err);
|
||||
cur = ( ub - lb ) / loop_st + 1;
|
||||
// Guided scheduling uses FP computations, so current chunk may
|
||||
// be a bit bigger (+1) than allowed maximum.
|
||||
if (!( cur <= max + 1))
|
||||
printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
|
||||
// Update maximum for the next chunk.
|
||||
if (!last && cur % ch)
|
||||
printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
|
||||
chunk, (int)cur, ch, tid, ++err);
|
||||
if (last && !no_chunk && cur > ch)
|
||||
printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
|
||||
(int)cur, ch, tid, ++err);
|
||||
if (cur < max)
|
||||
max = cur;
|
||||
last_ub = ub;
|
||||
undersized = (cur < ch);
|
||||
#if _DEBUG > 1
|
||||
if (last)
|
||||
printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
|
||||
undersized,cur,ch,tid,ub,lb,loop_st);
|
||||
#endif
|
||||
} // while
|
||||
// Must have the right last iteration index.
|
||||
if (loop_st > 0) {
|
||||
if (!(last_ub <= loop_ub))
|
||||
printf("Error with last1 %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk, ++err);
|
||||
if (last && !(last_ub + loop_st > loop_ub))
|
||||
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
|
||||
} else {
|
||||
if (!(last_ub >= loop_ub))
|
||||
printf("Error with last1 %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk, ++err);
|
||||
if (last && !(last_ub + loop_st < loop_ub))
|
||||
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
|
||||
} // if
|
||||
}
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
} // run_loop
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int chunk = 0;
|
||||
if (argc > 1) {
|
||||
// expect chunk size as a parameter
|
||||
chunk = atoi(argv[1]);
|
||||
}
|
||||
#pragma omp parallel //num_threads(num_th)
|
||||
run_loop(0, UBOUND, 1, chunk);
|
||||
if (err) {
|
||||
printf("failed, err = %d\n", err);
|
||||
return 1;
|
||||
} else {
|
||||
printf("passed\n");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,201 @@
|
||||
// RUN: %libomp-compile && %libomp-run
|
||||
// RUN: %libomp-run 1 && %libomp-run 2
|
||||
|
||||
// The test checks schedule(simd:runtime)
|
||||
// in combination with OMP_SCHEDULE=static[,chunk]
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <omp.h>
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#define delay() Sleep(1);
|
||||
#define seten(a,b,c) _putenv_s((a),(b))
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#define delay() usleep(10);
|
||||
#define seten(a,b,c) setenv((a),(b),(c))
|
||||
#endif
|
||||
|
||||
#define SIMD_LEN 4
|
||||
int err = 0;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Various definitions copied from OpenMP RTL.
|
||||
enum sched {
|
||||
kmp_sch_static_balanced_chunked = 45,
|
||||
kmp_sch_guided_simd = 46,
|
||||
kmp_sch_runtime_simd = 47,
|
||||
};
|
||||
typedef unsigned u32;
|
||||
typedef long long i64;
|
||||
typedef unsigned long long u64;
|
||||
typedef struct {
|
||||
int reserved_1;
|
||||
int flags;
|
||||
int reserved_2;
|
||||
int reserved_3;
|
||||
char *psource;
|
||||
} id;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int __kmpc_global_thread_num(id*);
|
||||
void __kmpc_barrier(id*, int gtid);
|
||||
void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
|
||||
void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
|
||||
int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
|
||||
int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
// End of definitions copied from OpenMP RTL.
|
||||
// ---------------------------------------------------------------------------
|
||||
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
void
|
||||
run_loop(
|
||||
int loop_lb, // Loop lower bound.
|
||||
int loop_ub, // Loop upper bound.
|
||||
int loop_st, // Loop stride.
|
||||
int lchunk
|
||||
) {
|
||||
static int volatile loop_sync = 0;
|
||||
int lb; // Chunk lower bound.
|
||||
int ub; // Chunk upper bound.
|
||||
int st; // Chunk stride.
|
||||
int rc;
|
||||
int tid = omp_get_thread_num();
|
||||
int gtid = __kmpc_global_thread_num(&loc);
|
||||
int last;
|
||||
int tc = (loop_ub - loop_lb) / loop_st + 1;
|
||||
int ch;
|
||||
int no_chunk = 0;
|
||||
if (lchunk == 0) {
|
||||
no_chunk = 1;
|
||||
lchunk = 1;
|
||||
}
|
||||
ch = lchunk * SIMD_LEN;
|
||||
#if _DEBUG > 1
|
||||
printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
|
||||
gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
|
||||
#endif
|
||||
// Don't test degenerate cases that should have been discovered by codegen.
|
||||
if (loop_st == 0)
|
||||
return;
|
||||
if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
|
||||
return;
|
||||
__kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
|
||||
loop_lb, loop_ub, loop_st, SIMD_LEN);
|
||||
{
|
||||
// Let the master thread handle the chunks alone.
|
||||
int chunk; // No of current chunk.
|
||||
int last_ub; // Upper bound of the last processed chunk.
|
||||
u64 cur; // Number of interations in current chunk.
|
||||
u64 max; // Max allowed iterations for current chunk.
|
||||
int undersized = 0;
|
||||
last_ub = loop_ub;
|
||||
chunk = 0;
|
||||
max = (loop_ub - loop_lb) / loop_st + 1;
|
||||
// The first chunk can consume all iterations.
|
||||
while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
|
||||
++ chunk;
|
||||
#if _DEBUG
|
||||
printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
|
||||
tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
|
||||
#endif
|
||||
// Check if previous chunk (it is not the final chunk) is undersized.
|
||||
if (undersized)
|
||||
printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
|
||||
if (loop_st > 0) {
|
||||
if (!(ub <= loop_ub))
|
||||
printf("Error with ub %d, %d, ch %d, err %d\n",
|
||||
(int)ub, (int)loop_ub, chunk, ++err);
|
||||
if (!(lb <= ub))
|
||||
printf("Error with bounds %d, %d, %d, err %d\n",
|
||||
(int)lb, (int)ub, chunk, ++err);
|
||||
} else {
|
||||
if (!(ub >= loop_ub))
|
||||
printf("Error with ub %d, %d, %d, err %d\n",
|
||||
(int)ub, (int)loop_ub, chunk, ++err);
|
||||
if (!(lb >= ub))
|
||||
printf("Error with bounds %d, %d, %d, err %d\n",
|
||||
(int)lb, (int)ub, chunk, ++err);
|
||||
}; // if
|
||||
// Stride should not change.
|
||||
if (!(st == loop_st))
|
||||
printf("Error with st %d, %d, ch %d, err %d\n",
|
||||
(int)st, (int)loop_st, chunk, ++err);
|
||||
cur = ( ub - lb ) / loop_st + 1;
|
||||
// Guided scheduling uses FP computations, so current chunk may
|
||||
// be a bit bigger (+1) than allowed maximum.
|
||||
if (!( cur <= max + 1))
|
||||
printf("Error with iter %d, %d, err %d\n", cur, max, ++err);
|
||||
// Update maximum for the next chunk.
|
||||
if (last) {
|
||||
if (!no_chunk && cur > ch)
|
||||
printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
|
||||
(int)cur, ch, tid, ++err);
|
||||
} else {
|
||||
if (cur % ch)
|
||||
printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
|
||||
chunk, (int)cur, ch, tid, ++err);
|
||||
}
|
||||
if (cur < max)
|
||||
max = cur;
|
||||
last_ub = ub;
|
||||
undersized = (cur < ch);
|
||||
#if _DEBUG > 1
|
||||
if (last)
|
||||
printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
|
||||
undersized,cur,ch,tid,ub,lb,loop_st);
|
||||
#endif
|
||||
} // while
|
||||
// Must have the right last iteration index.
|
||||
if (loop_st > 0) {
|
||||
if (!(last_ub <= loop_ub))
|
||||
printf("Error with last1 %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk, ++err);
|
||||
if (last && !(last_ub + loop_st > loop_ub))
|
||||
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
|
||||
} else {
|
||||
if (!(last_ub >= loop_ub))
|
||||
printf("Error with last1 %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_ub, chunk, ++err);
|
||||
if (last && !(last_ub + loop_st < loop_ub))
|
||||
printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
|
||||
(int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
|
||||
} // if
|
||||
}
|
||||
__kmpc_barrier(&loc, gtid);
|
||||
} // run_loop
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int chunk = 0;
|
||||
if (argc > 1) {
|
||||
char *buf = malloc(8 + strlen(argv[1]));
|
||||
// expect chunk size as a parameter
|
||||
chunk = atoi(argv[1]);
|
||||
strcpy(buf,"static,");
|
||||
strcat(buf,argv[1]);
|
||||
seten("OMP_SCHEDULE",buf,1);
|
||||
printf("Testing schedule(simd:%s)\n", buf);
|
||||
free(buf);
|
||||
} else {
|
||||
seten("OMP_SCHEDULE","static",1);
|
||||
printf("Testing schedule(simd:static)\n");
|
||||
}
|
||||
#pragma omp parallel// num_threads(num_th)
|
||||
run_loop(0, 26, 1, chunk);
|
||||
if (err) {
|
||||
printf("failed, err = %d\n", err);
|
||||
return 1;
|
||||
} else {
|
||||
printf("passed\n");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user