re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
Johannes Singler 2007-11-22 10:13:08 +00:00 committed by Johannes Singler
parent 7861a5ce14
commit e683ee2a20
21 changed files with 3756 additions and 3146 deletions
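The recurring bug behind PR 33893: with dynamic thread adjustment enabled (omp_set_dynamic(true)), an OpenMP runtime may grant a parallel region fewer threads than requested, so omp_get_max_threads() is only an upper bound and the thread count must be re-read inside the region. A minimal standalone sketch of the symptom (illustrative code, not taken from this commit):

```cpp
#include <cstdio>
#include <omp.h>

int main()
{
  omp_set_dynamic(true);               // the runtime may grant fewer threads
  int requested = omp_get_max_threads();
  int granted = 0;
#pragma omp parallel num_threads(requested)
  {
#pragma omp single
    granted = omp_get_num_threads();   // the only reliable count
  }
  std::printf("requested %d, granted %d\n", requested, granted);
  return 0;
}
```

The "made omp_dynamic-safe" changes below all follow this pattern: determine the actual team size inside the parallel region (typically under `#pragma omp single`) instead of trusting the requested count.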

libstdc++-v3/ChangeLog

@@ -1,3 +1,27 @@
+2007-11-22  Johannes Singler  <singler@ira.uka.de>
+
+        PR libstdc++/33893
+        * include/parallel/multiway_merge.h: made omp_dynamic-safe
+        * include/parallel/workstealing.h: made omp_dynamic-safe
+        * include/parallel/base.h: infrastructure, cleanup
+        * include/parallel/par_loop.h: made omp_dynamic-safe
+        * include/parallel/features.h: activate loser tree variant
+        * include/parallel/quicksort.h: made omp_dynamic-safe
+        * include/parallel/compiletime_settings.h: settings overridable
+        * include/parallel/equally_split.h: made omp_dynamic-safe
+        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
+        * include/parallel/random_shuffle.h: made omp_dynamic-safe
+        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
+        * include/parallel/set_operations.h: made omp_dynamic-safe
+        * include/parallel/unique_copy.h: made omp_dynamic-safe
+        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
+        * include/parallel/search.h: made omp_dynamic-safe
+        * include/parallel/partition.h: made omp_dynamic-safe
+        * include/parallel/partial_sum.h: made omp_dynamic-safe
+        * include/parallel/find.h: made omp_dynamic-safe
+        * include/parallel/omp_loop.h: made omp_dynamic-safe
+        * include/parallel/losertree.h: avoid default constructor
+
 2007-11-21  Jonathan Wakely  <jwakely.gcc@gmail.com>
 
         * docs/html/17_intro/C++STYLE: Fix typos.

libstdc++-v3/include/parallel/balanced_quicksort.h

@@ -63,15 +63,15 @@
 namespace __gnu_parallel
 {
   /** @brief Information local to one thread in the parallel quicksort run. */
   template<typename RandomAccessIterator>
     struct QSBThreadLocal
     {
       typedef std::iterator_traits<RandomAccessIterator> traits_type;
       typedef typename traits_type::difference_type difference_type;
 
       /** @brief Continuous part of the sequence, described by an
       iterator pair. */
       typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
 
       /** @brief Initial piece to work on. */
@@ -94,29 +94,17 @@ namespace __gnu_parallel
       QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
     };
 
-  /** @brief Initialize the thread local storage.
-   *  @param tls Array of thread-local storages.
-   *  @param queue_size Size of the work-stealing queue. */
-  template<typename RandomAccessIterator>
-    inline void
-    qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size)
-    {
-      int iam = omp_get_thread_num();
-      tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
-    }
-
   /** @brief Balanced quicksort divide step.
    *  @param begin Begin iterator of subsequence.
    *  @param end End iterator of subsequence.
    *  @param comp Comparator.
    *  @param num_threads Number of threads that are allowed to work on
    *  this part.
    *  @pre @c (end-begin)>=1 */
   template<typename RandomAccessIterator, typename Comparator>
     inline typename std::iterator_traits<RandomAccessIterator>::difference_type
     qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
-               Comparator comp, int num_threads)
+               Comparator comp, thread_index_t num_threads)
     {
       _GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
@@ -124,18 +112,20 @@ namespace __gnu_parallel
       typedef typename traits_type::value_type value_type;
       typedef typename traits_type::difference_type difference_type;
 
-      RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp);
+      RandomAccessIterator pivot_pos = median_of_three_iterators(
+          begin, begin + (end - begin) / 2, end - 1, comp);
 
 #if defined(_GLIBCXX_ASSERTIONS)
       // Must be in between somewhere.
       difference_type n = end - begin;
 
-      _GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
-                               || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
-                               || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
-                               || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
-                               || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
-                               || (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
+      _GLIBCXX_PARALLEL_ASSERT(
+        (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
+        || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
+        || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
+        || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
+        || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
+        || (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
 #endif
 
       // Swap pivot value to end.
@@ -143,10 +133,12 @@ namespace __gnu_parallel
       std::swap(*pivot_pos, *(end - 1));
       pivot_pos = end - 1;
 
-      __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
+      __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
+          pred(comp, *pivot_pos);
 
       // Divide, returning end - begin - 1 in the worst case.
-      difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads);
+      difference_type split_pos = parallel_partition(
+          begin, end - 1, pred, num_threads);
 
       // Swap back pivot to middle.
       std::swap(*(begin + split_pos), *pivot_pos);
@@ -163,18 +155,21 @@ namespace __gnu_parallel
       return split_pos;
     }
 
   /** @brief Quicksort conquer step.
    *  @param tls Array of thread-local storages.
    *  @param begin Begin iterator of subsequence.
    *  @param end End iterator of subsequence.
    *  @param comp Comparator.
    *  @param iam Number of the thread processing this function.
-   *  @param num_threads Number of threads that are allowed to work on this part. */
+   *  @param num_threads
+   *          Number of threads that are allowed to work on this part. */
   template<typename RandomAccessIterator, typename Comparator>
     inline void
     qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
                 RandomAccessIterator begin, RandomAccessIterator end,
-                Comparator comp, thread_index_t iam, thread_index_t num_threads)
+                Comparator comp,
+                thread_index_t iam, thread_index_t num_threads,
+                bool parent_wait)
     {
       typedef std::iterator_traits<RandomAccessIterator> traits_type;
       typedef typename traits_type::value_type value_type;
@@ -182,14 +177,14 @@ namespace __gnu_parallel
       difference_type n = end - begin;
 
-      if (num_threads <= 1 || n < 2)
+      if (num_threads <= 1 || n <= 1)
         {
           tls[iam]->initial.first  = begin;
           tls[iam]->initial.second = end;
 
-          qsb_local_sort_with_helping(tls, comp, iam);
+          qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
 
           return;
         }
 
       // Divide step.
@@ -199,33 +194,55 @@ namespace __gnu_parallel
       _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
 #endif
 
-      thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n));
+      thread_index_t num_threads_leftside =
+        std::max<thread_index_t>(1, std::min<thread_index_t>(
+          num_threads - 1, split_pos * num_threads / n));
 
-#pragma omp atomic
+#     pragma omp atomic
       *tls[iam]->elements_leftover -= (difference_type)1;
 
       // Conquer step.
-#pragma omp parallel sections num_threads(2)
+#     pragma omp parallel num_threads(2)
       {
-#pragma omp section
-        qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside);
-        // The pivot_pos is left in place, to ensure termination.
-#pragma omp section
-        qsb_conquer(tls, begin + split_pos + 1, end, comp,
-                    iam + num_threads_leftside, num_threads - num_threads_leftside);
+        bool wait;
+        if(omp_get_num_threads() < 2)
+          wait = false;
+        else
+          wait = parent_wait;
+
+#       pragma omp sections
+        {
+#         pragma omp section
+          {
+            qsb_conquer(tls, begin, begin + split_pos, comp,
+                        iam,
+                        num_threads_leftside,
+                        wait);
+            wait = parent_wait;
+          }
+          // The pivot_pos is left in place, to ensure termination.
+#         pragma omp section
+          {
+            qsb_conquer(tls, begin + split_pos + 1, end, comp,
+                        iam + num_threads_leftside,
+                        num_threads - num_threads_leftside,
+                        wait);
+            wait = parent_wait;
+          }
+        }
       }
     }
 
   /**
    *  @brief Quicksort step doing load-balanced local sort.
    *  @param tls Array of thread-local storages.
    *  @param comp Comparator.
    *  @param iam Number of the thread processing this function.
    */
   template<typename RandomAccessIterator, typename Comparator>
     inline void
     qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
-                                Comparator& comp, int iam)
+                                Comparator& comp, int iam, bool wait)
     {
       typedef std::iterator_traits<RandomAccessIterator> traits_type;
       typedef typename traits_type::value_type value_type;
@@ -251,151 +268,162 @@ namespace __gnu_parallel
       for (;;)
         {
           // Invariant: current must be a valid (maybe empty) range.
           RandomAccessIterator begin = current.first, end = current.second;
           difference_type n = end - begin;
 
           if (n > base_case_n)
             {
               // Divide.
               RandomAccessIterator pivot_pos = begin + rng(n);
 
               // Swap pivot_pos value to end.
               if (pivot_pos != (end - 1))
                 std::swap(*pivot_pos, *(end - 1));
               pivot_pos = end - 1;
 
-              __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
+              __gnu_parallel::binder2nd
+                  <Comparator, value_type, value_type, bool>
+                  pred(comp, *pivot_pos);
 
               // Divide, leave pivot unchanged in last place.
               RandomAccessIterator split_pos1, split_pos2;
               split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
 
               // Left side: < pivot_pos; right side: >= pivot_pos.
 #if _GLIBCXX_ASSERTIONS
               _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
 #endif
               // Swap pivot back to middle.
               if (split_pos1 != pivot_pos)
                 std::swap(*split_pos1, *pivot_pos);
               pivot_pos = split_pos1;
 
               // In case all elements are equal, split_pos1 == 0.
               if ((split_pos1 + 1 - begin) < (n >> 7)
                   || (end - split_pos1) < (n >> 7))
                 {
                   // Very unequal split, one part smaller than one 128th
                   // elements not strictly larger than the pivot.
-                  __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
+                  __gnu_parallel::unary_negate<__gnu_parallel::binder1st
+                      <Comparator, value_type, value_type, bool>, value_type>
+                      pred(__gnu_parallel::binder1st
+                          <Comparator, value_type, value_type, bool>(
+                              comp, *pivot_pos));
 
                   // Find other end of pivot-equal range.
-                  split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
+                  split_pos2 = __gnu_sequential::partition(
+                      split_pos1 + 1, end, pred);
                 }
               else
-                {
-                  // Only skip the pivot.
-                  split_pos2 = split_pos1 + 1;
-                }
+                // Only skip the pivot.
+                split_pos2 = split_pos1 + 1;
 
               // Elements equal to pivot are done.
               elements_done += (split_pos2 - split_pos1);
 #if _GLIBCXX_ASSERTIONS
               total_elements_done += (split_pos2 - split_pos1);
 #endif
               // Always push larger part onto stack.
               if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
                 {
                   // Right side larger.
                   if ((split_pos2) != end)
                     tl.leftover_parts.push_front(std::make_pair(split_pos2, end));
 
                   //current.first = begin;    //already set anyway
                   current.second = split_pos1;
                   continue;
                 }
               else
                 {
                   // Left side larger.
                   if (begin != split_pos1)
-                    tl.leftover_parts.push_front(std::make_pair(begin, split_pos1));
+                    tl.leftover_parts.push_front(
+                        std::make_pair(begin, split_pos1));
 
                   current.first = split_pos2;
                   //current.second = end;     //already set anyway
                   continue;
                 }
             }
           else
             {
               __gnu_sequential::sort(begin, end, comp);
               elements_done += n;
 #if _GLIBCXX_ASSERTIONS
               total_elements_done += n;
 #endif
 
               // Prefer own stack, small pieces.
               if (tl.leftover_parts.pop_front(current))
                 continue;
 
-#pragma omp atomic
+#             pragma omp atomic
               *tl.elements_leftover -= elements_done;
 
               elements_done = 0;
 
 #if _GLIBCXX_ASSERTIONS
               double search_start = omp_get_wtime();
 #endif
 
               // Look for new work.
-              bool success = false;
-              while (*tl.elements_leftover > 0 && !success
+              bool successfully_stolen = false;
+              while (wait && *tl.elements_leftover > 0 && !successfully_stolen
 #if _GLIBCXX_ASSERTIONS
                      // Possible dead-lock.
                      && (omp_get_wtime() < (search_start + 1.0))
 #endif
                      )
                 {
                   thread_index_t victim;
                   victim = rng(num_threads);
 
                   // Large pieces.
-                  success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current);
-                  if (!success)
+                  successfully_stolen = (victim != iam)
+                      && tls[victim]->leftover_parts.pop_back(current);
+                  if (!successfully_stolen)
                     yield();
 #if !defined(__ICC) && !defined(__ECC)
-#pragma omp flush
+#                 pragma omp flush
 #endif
                 }
 
 #if _GLIBCXX_ASSERTIONS
               if (omp_get_wtime() >= (search_start + 1.0))
                 {
                   sleep(1);
-                  _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0));
+                  _GLIBCXX_PARALLEL_ASSERT(
+                      omp_get_wtime() < (search_start + 1.0));
                 }
 #endif
 
-              if (!success)
+              if (!successfully_stolen)
                 {
 #if _GLIBCXX_ASSERTIONS
                   _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
 #endif
                   return;
                 }
             }
         }
     }
 
   /** @brief Top-level quicksort routine.
    *  @param begin Begin iterator of sequence.
    *  @param end End iterator of sequence.
    *  @param comp Comparator.
    *  @param n Length of the sequence to sort.
    *  @param num_threads Number of threads that are allowed to work on
    *  this part.
    */
   template<typename RandomAccessIterator, typename Comparator>
     inline void
     parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
                       Comparator comp,
-                      typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
+                      typename std::iterator_traits<RandomAccessIterator>
+                          ::difference_type n,
+                      thread_index_t num_threads)
     {
       _GLIBCXX_CALL(end - begin)
@@ -413,11 +441,11 @@ namespace __gnu_parallel
       if (num_threads > n)
         num_threads = static_cast<thread_index_t>(n);
 
-      // Initialize thread local storage
       tls_type** tls = new tls_type*[num_threads];
-#pragma omp parallel num_threads(num_threads)
-      // Initialize variables per processor.
-      qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1));
+      difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
+      for (thread_index_t t = 0; t < num_threads; ++t)
+        tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
 
       // There can never be more than ceil(log2(n)) ranges on the stack, because
       // 1. Only one processor pushes onto the stack
@@ -426,22 +454,16 @@ namespace __gnu_parallel
       volatile difference_type elements_leftover = n;
       for (int i = 0; i < num_threads; i++)
         {
          tls[i]->elements_leftover = &elements_leftover;
          tls[i]->num_threads = num_threads;
          tls[i]->global = std::make_pair(begin, end);
 
          // Just in case nothing is left to assign.
          tls[i]->initial = std::make_pair(end, end);
        }
 
-      // Initial splitting, recursively.
-      int old_nested = omp_get_nested();
-      omp_set_nested(true);
-
       // Main recursion call.
-      qsb_conquer(tls, begin, begin + n, comp, 0, num_threads);
+      qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
 
-      omp_set_nested(old_nested);
-
 #if _GLIBCXX_ASSERTIONS
       // All stack must be empty.
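The key change in qsb_conquer above: instead of relying on omp_set_nested(), it requests a team of two and degrades gracefully when the runtime grants only one thread, in which case the work-stealing wait flag is cleared so the single thread never waits for work that cannot arrive. A compilable sketch of that pattern; the names conquer_half and parent_wait are illustrative stand-ins, not library code:

```cpp
#include <cstdio>
#include <omp.h>

// Hypothetical stand-in for the two recursive qsb_conquer() calls.
static void conquer_half(const char* label, bool wait)
{
  std::printf("%s on thread %d, wait=%d\n",
              label, omp_get_thread_num(), (int)wait);
}

int main()
{
  bool parent_wait = true;
#pragma omp parallel num_threads(2)
  {
    // If only one thread was granted, both sections run sequentially on
    // it; stealing/waiting must then be disabled.
    bool wait = (omp_get_num_threads() < 2) ? false : parent_wait;
#pragma omp sections
    {
#pragma omp section
      conquer_half("left half", wait);
#pragma omp section
      conquer_half("right half", wait);
    }
  }
  return 0;
}
```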

libstdc++-v3/include/parallel/base.h

@@ -49,54 +49,70 @@ namespace __gnu_parallel
 // XXX remove std::duplicates from here if possible,
 // XXX but keep minimal dependencies.
 
   /** @brief Calculates the rounded-down logarithm of @c n for base 2.
    *  @param n Argument.
    *  @return Returns 0 for argument 0.
    */
   template<typename Size>
     inline Size
     log2(Size n)
     {
       Size k;
       for (k = 0; n != 1; n >>= 1)
         ++k;
       return k;
     }
 
   /** @brief Encode two integers into one __gnu_parallel::lcas_t.
    *  @param a First integer, to be encoded in the most-significant @c
    *  lcas_t_bits/2 bits.
    *  @param b Second integer, to be encoded in the least-significant
    *  @c lcas_t_bits/2 bits.
    *  @return __gnu_parallel::lcas_t value encoding @c a and @c b.
    *  @see decode2
    */
   inline lcas_t
   encode2(int a, int b)     //must all be non-negative, actually
   {
     return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
   }
 
   /** @brief Decode two integers from one __gnu_parallel::lcas_t.
    *  @param x __gnu_parallel::lcas_t to decode integers from.
    *  @param a First integer, to be decoded from the most-significant
    *  @c lcas_t_bits/2 bits of @c x.
    *  @param b Second integer, to be encoded in the least-significant
    *  @c lcas_t_bits/2 bits of @c x.
    *  @see encode2
    */
   inline void
   decode2(lcas_t x, int& a, int& b)
   {
     a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
     b = (int)((x >>               0 ) & lcas_t_mask);
   }
 
+  /** @brief Equivalent to std::min. */
+  template<typename T>
+    const T&
+    min(const T& a, const T& b)
+    {
+      return (a < b) ? a : b;
+    };
+
+  /** @brief Equivalent to std::max. */
+  template<typename T>
+    const T&
+    max(const T& a, const T& b)
+    {
+      return (a > b) ? a : b;
+    };
+
   /** @brief Constructs predicate for equality from strict weak
    *  ordering predicate
    */
   // XXX comparator at the end, as per others
   template<typename Comparator, typename T1, typename T2>
     class equal_from_less : public std::binary_function<T1, T2, bool>
     {
     private:
@@ -112,162 +128,176 @@ namespace __gnu_parallel
     };
 
-  /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
+  /** @brief Similar to std::binder1st,
+   *  but giving the argument types explicitly. */
   template<typename _Predicate, typename argument_type>
     class unary_negate
     : public std::unary_function<argument_type, bool>
     {
     protected:
       _Predicate _M_pred;
 
     public:
       explicit
       unary_negate(const _Predicate& __x) : _M_pred(__x) { }
 
       bool
       operator()(const argument_type& __x)
       { return !_M_pred(__x); }
     };
 
-  /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
-  template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
+  /** @brief Similar to std::binder1st,
+   *  but giving the argument types explicitly. */
+  template<
+    typename _Operation,
+    typename first_argument_type,
+    typename second_argument_type,
+    typename result_type>
     class binder1st
     : public std::unary_function<second_argument_type, result_type>
     {
     protected:
       _Operation op;
       first_argument_type value;
 
     public:
       binder1st(const _Operation& __x,
                 const first_argument_type& __y)
       : op(__x), value(__y) { }
 
       result_type
       operator()(const second_argument_type& __x)
       { return op(value, __x); }
 
       // _GLIBCXX_RESOLVE_LIB_DEFECTS
       // 109.  Missing binders for non-const sequence elements
       result_type
       operator()(second_argument_type& __x) const
       { return op(value, __x); }
     };
 
   /**
    *  @brief Similar to std::binder2nd, but giving the argument types
    *  explicitly.
    */
-  template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
+  template<
+    typename _Operation,
+    typename first_argument_type,
+    typename second_argument_type,
+    typename result_type>
     class binder2nd
     : public std::unary_function<first_argument_type, result_type>
     {
     protected:
       _Operation op;
       second_argument_type value;
 
     public:
       binder2nd(const _Operation& __x,
                 const second_argument_type& __y)
       : op(__x), value(__y) { }
 
       result_type
       operator()(const first_argument_type& __x) const
       { return op(__x, value); }
 
       // _GLIBCXX_RESOLVE_LIB_DEFECTS
       // 109.  Missing binders for non-const sequence elements
       result_type
       operator()(first_argument_type& __x)
       { return op(__x, value); }
     };
 
   /** @brief Similar to std::equal_to, but allows two different types. */
   template<typename T1, typename T2>
     struct equal_to : std::binary_function<T1, T2, bool>
     {
       bool operator()(const T1& t1, const T2& t2) const
       { return t1 == t2; }
     };
 
   /** @brief Similar to std::less, but allows two different types. */
   template<typename T1, typename T2>
     struct less : std::binary_function<T1, T2, bool>
     {
       bool
       operator()(const T1& t1, const T2& t2) const
       { return t1 < t2; }
 
       bool
       operator()(const T2& t2, const T1& t1) const
       { return t2 < t1; }
     };
 
   // Partial specialization for one type. Same as std::less.
   template<typename _Tp>
     struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
     {
       bool
       operator()(const _Tp& __x, const _Tp& __y) const
       { return __x < __y; }
     };
 
   /** @brief Similar to std::plus, but allows two different types. */
   template<typename _Tp1, typename _Tp2>
     struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
     {
-      typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
+      typedef typeof(*static_cast<_Tp1*>(NULL)
+                     + *static_cast<_Tp2*>(NULL)) result;
 
       result
       operator()(const _Tp1& __x, const _Tp2& __y) const
       { return __x + __y; }
     };
 
   // Partial specialization for one type. Same as std::plus.
   template<typename _Tp>
     struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
     {
-      typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
+      typedef typeof(*static_cast<_Tp*>(NULL)
+                     + *static_cast<_Tp*>(NULL)) result;
 
       result
       operator()(const _Tp& __x, const _Tp& __y) const
       { return __x + __y; }
     };
 
   /** @brief Similar to std::multiplies, but allows two different types. */
   template<typename _Tp1, typename _Tp2>
     struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
     {
-      typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
+      typedef typeof(*static_cast<_Tp1*>(NULL)
+                     * *static_cast<_Tp2*>(NULL)) result;
 
       result
       operator()(const _Tp1& __x, const _Tp2& __y) const
       { return __x * __y; }
     };
 
   // Partial specialization for one type. Same as std::multiplies.
   template<typename _Tp>
     struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
     {
-      typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
+      typedef typeof(*static_cast<_Tp*>(NULL)
+                     * *static_cast<_Tp*>(NULL)) result;
 
       result
       operator()(const _Tp& __x, const _Tp& __y) const
       { return __x * __y; }
     };
 
   template<typename T, typename _DifferenceTp>
     class pseudo_sequence;
 
   /** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
    *  If features the usual random-access iterator functionality.
    *  @param T Sequence value type.
    *  @param difference_type Sequence difference type.
    */
   template<typename T, typename _DifferenceTp>
     class pseudo_sequence_iterator
     {
     public:
@@ -296,34 +326,34 @@ namespace __gnu_parallel
       operator++(int)
       { return type(pos++); }
 
       const T&
       operator*() const
       { return val; }
 
       const T&
       operator[](difference_type) const
       { return val; }
 
       bool
       operator==(const type& i2)
       { return pos == i2.pos; }
 
       difference_type
       operator!=(const type& i2)
       { return pos != i2.pos; }
 
       difference_type
       operator-(const type& i2)
       { return pos - i2.pos; }
     };
 
   /** @brief Sequence that conceptually consists of multiple copies of
       the same element.
    *  The copies are not stored explicitly, of course.
    *  @param T Sequence value type.
    *  @param difference_type Sequence difference type.
    */
   template<typename T, typename _DifferenceTp>
     class pseudo_sequence
     {
       typedef pseudo_sequence<T, _DifferenceTp> type;
@@ -335,10 +365,10 @@ namespace __gnu_parallel
       typedef pseudo_sequence_iterator<T, uint64> iterator;
 
       /** @brief Constructor.
        *  @param val Element of the sequence.
        *  @param count Number of (virtual) copies.
        */
       pseudo_sequence(const T& val, difference_type count)
       : val(val), count(count) { }
 
       /** @brief Begin iterator. */
@@ -356,67 +386,66 @@ namespace __gnu_parallel
       difference_type count;
     };
 
   /** @brief Functor that does nothing */
   template<typename _ValueTp>
     class void_functor
     {
       inline void
       operator()(const _ValueTp& v) const { }
     };
 
   /** @brief Compute the median of three referenced elements,
       according to @c comp.
    *  @param a First iterator.
    *  @param b Second iterator.
    *  @param c Third iterator.
    *  @param comp Comparator.
    */
   template<typename RandomAccessIterator, typename Comparator>
     RandomAccessIterator
     median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
                               RandomAccessIterator c, Comparator& comp)
     {
       if (comp(*a, *b))
         if (comp(*b, *c))
           return b;
         else
           if (comp(*a, *c))
             return c;
           else
             return a;
       else
         {
           // Just swap a and b.
           if (comp(*a, *c))
             return a;
           else
             if (comp(*b, *c))
               return c;
             else
               return b;
         }
     }
 
   // Avoid the use of assert, because we're trying to keep the <cassert>
   // include out of the mix. (Same as debug mode).
   inline void
   __replacement_assert(const char* __file, int __line,
                        const char* __function, const char* __condition)
   {
     std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
                 __function, __condition);
     __builtin_abort();
   }
 
 #define _GLIBCXX_PARALLEL_ASSERT(_Condition)                            \
 do                                                                      \
   {                                                                     \
     if (!(_Condition))                                                  \
       __gnu_parallel::__replacement_assert(__FILE__, __LINE__,          \
                                            __PRETTY_FUNCTION__, #_Condition); \
   } while (false)
 
 } //namespace __gnu_parallel
 
 #endif
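The encode2/decode2 pair kept above packs two non-negative ints into one word so that both can be updated by a single compare-and-swap. A standalone round-trip check; the lcas_t typedef and the constants here are stand-ins assuming a 64-bit word, not the library's actual definitions:

```cpp
#include <cassert>
#include <cstdint>

// Stand-ins for __gnu_parallel's lcas_t machinery (assumed 64-bit).
typedef uint64_t lcas_t;
const int lcas_t_bits = 64;
const lcas_t lcas_t_mask = 0xFFFFFFFFULL;

inline lcas_t encode2(int a, int b)   // both assumed non-negative
{ return (((lcas_t)a) << (lcas_t_bits / 2)) | ((lcas_t)b); }

inline void decode2(lcas_t x, int& a, int& b)
{
  a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
  b = (int)(x & lcas_t_mask);
}

int main()
{
  int a, b;
  decode2(encode2(123, 456), a, b);
  assert(a == 123 && b == 456);   // round trip preserves both halves
  return 0;
}
```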

libstdc++-v3/include/parallel/compiletime_settings.h

@@ -39,7 +39,7 @@
 #include <cstdio>
 
 /** @brief Determine verbosity level of the parallel mode.
- *  Level 1 prints a message each time when entering a parallel-mode function. */
+ *  Level 1 prints a message each time a parallel-mode function is entered. */
 #define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL /** @def _GLIBCXX_CALL
@@ -50,27 +50,40 @@
 #define _GLIBCXX_CALL(n)
 #endif
 #if (_GLIBCXX_VERBOSE_LEVEL == 1)
-#define _GLIBCXX_CALL(n) printf("  %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
+#define _GLIBCXX_CALL(n) \
+  printf("  %s:\niam = %d, n = %ld, num_threads = %d\n", \
+         __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
 #endif
 
+#ifndef _GLIBCXX_SCALE_DOWN_FPU
 /** @brief Use floating-point scaling instead of modulo for mapping
  *  random numbers to a range.  This can be faster on certain CPUs. */
 #define _GLIBCXX_SCALE_DOWN_FPU 0
+#endif
 
+#ifndef _GLIBCXX_ASSERTIONS
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
  *  Should be switched on only locally. */
 #define _GLIBCXX_ASSERTIONS 0
+#endif
 
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the L1 cache for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
+#endif
 
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the TLB for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
+#endif
 
+#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
 /** @brief First copy the data, sort it locally, and merge it back
  *  (0); or copy it back after everything is done (1).
  *
  *  Recommendation: 0 */
 #define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
+#endif

libstdc++-v3/include/parallel/equally_split.h

@@ -39,30 +39,58 @@
 namespace __gnu_parallel
 {
   /** @brief Function to split a sequence into parts of almost equal size.
    *
-   *  The resulting sequence s of length p+1 contains the splitting
+   *  The resulting sequence s of length num_threads+1 contains the splitting
    *  positions when splitting the range [0,n) into parts of almost
    *  equal size (plus minus 1).  The first entry is 0, the last one
    *  n. There may result empty parts.
    *  @param n Number of elements
-   *  @param p Number of parts
+   *  @param num_threads Number of parts
    *  @param s Splitters
-   *  @returns End of splitter sequence, i. e. @c s+p+1 */
-  template<typename _DifferenceTp, typename OutputIterator>
+   *  @returns End of splitter sequence, i. e. @c s+num_threads+1 */
+  template<typename difference_type, typename OutputIterator>
     OutputIterator
-    equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
+    equally_split(difference_type n,
+                  thread_index_t num_threads,
+                  OutputIterator s)
     {
-      typedef _DifferenceTp difference_type;
-      difference_type chunk_length = n / p, split = n % p, start = 0;
-      for (int i = 0; i < p; i++)
+      difference_type chunk_length = n / num_threads,
+                      num_longer_chunks = n % num_threads,
+                      pos = 0;
+      for (thread_index_t i = 0; i < num_threads; ++i)
         {
-          *s++ = start;
-          start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
+          *s++ = pos;
+          pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
         }
       *s++ = n;
 
       return s;
     }
+
+  /** @brief Function to split a sequence into parts of almost equal size.
+   *
+   *  Returns the position of the splitting point between
+   *  thread number thread_no (included) and
+   *  thread number thread_no+1 (excluded).
+   *  @param n Number of elements
+   *  @param num_threads Number of parts
+   *  @returns Splitting point */
+  template<typename difference_type>
+    difference_type
+    equally_split_point(difference_type n,
+                        thread_index_t num_threads,
+                        thread_index_t thread_no)
+    {
+      difference_type chunk_length = n / num_threads,
+                      num_longer_chunks = n % num_threads;
+      if(thread_no < num_longer_chunks)
+        return thread_no * (chunk_length + 1);
+      else
+        return num_longer_chunks * (chunk_length + 1)
+            + (thread_no - num_longer_chunks) * chunk_length;
+    }
 }
 
 #endif
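A quick standalone check of the splitting arithmetic: 10 elements over 3 threads give one chunk of 4 and two of 3, i.e. borders 0, 4, 7, 10, and the new equally_split_point reproduces each border independently (which is what lets a thread compute its own range inside a parallel region without a shared array). The thread_index_t typedef below is a stand-in for the one in parallel/types.h:

```cpp
#include <cassert>

typedef unsigned short thread_index_t;  // stand-in for the library typedef

// Copy of equally_split_point from above, for a standalone check.
template<typename difference_type>
difference_type
equally_split_point(difference_type n, thread_index_t num_threads,
                    thread_index_t thread_no)
{
  difference_type chunk_length = n / num_threads,
                  num_longer_chunks = n % num_threads;
  if (thread_no < num_longer_chunks)
    return thread_no * (chunk_length + 1);
  return num_longer_chunks * (chunk_length + 1)
      + (thread_no - num_longer_chunks) * chunk_length;
}

int main()
{
  // Splitting 10 elements over 3 threads: one chunk of 4, two of 3.
  assert(equally_split_point(10, 3, 0) == 0);
  assert(equally_split_point(10, 3, 1) == 4);
  assert(equally_split_point(10, 3, 2) == 7);
  assert(equally_split_point(10, 3, 3) == 10);  // == n, the end border
  return 0;
}
```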

libstdc++-v3/include/parallel/features.h

@@ -66,7 +66,7 @@
  *  @brief Include guarded (sequences may run empty) loser tree,
  *  moving objects.
  *  @see __gnu_parallel::Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE 0
+#define _GLIBCXX_LOSER_TREE 1
 #endif
 
 #ifndef _GLIBCXX_LOSER_TREE_EXPLICIT

libstdc++-v3/include/parallel/find.h

@@ -10,7 +10,7 @@
 // This library is distributed in the hope that it will be useful, but
 // WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURstartE.  See the GNU
 // General Public License for more details.
 
 // You should have received a copy of the GNU General Public License
@@ -48,50 +48,66 @@
 namespace __gnu_parallel
 {
   /**
   *  @brief Parallel std::find, switch for different algorithms.
   *  @param begin1 Begin iterator of first sequence.
   *  @param end1 End iterator of first sequence.
   *  @param begin2 Begin iterator of second sequence. Must have same
   *  length as first sequence.
   *  @param pred Find predicate.
   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
   *  @return Place of finding in both sequences.
   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+  template<
+      typename RandomAccessIterator1,
+      typename RandomAccessIterator2,
+      typename Pred,
+      typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
   find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
                 RandomAccessIterator2 begin2, Pred pred, Selector selector)
   {
     switch (Settings::find_distribution)
       {
       case Settings::GROWING_BLOCKS:
-        return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag());
+        return find_template(begin1, end1, begin2, pred, selector,
+                             growing_blocks_tag());
       case Settings::CONSTANT_SIZE_BLOCKS:
-        return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag());
+        return find_template(begin1, end1, begin2, pred, selector,
+                             constant_size_blocks_tag());
       case Settings::EQUAL_SPLIT:
-        return find_template(begin1, end1, begin2, pred, selector, equal_split_tag());
+        return find_template(begin1, end1, begin2, pred, selector,
+                             equal_split_tag());
       default:
         _GLIBCXX_PARALLEL_ASSERT(false);
         return std::make_pair(begin1, begin2);
       }
   }
 
 #if _GLIBCXX_FIND_EQUAL_SPLIT
 
   /**
   *  @brief Parallel std::find, equal splitting variant.
   *  @param begin1 Begin iterator of first sequence.
   *  @param end1 End iterator of first sequence.
   *  @param begin2 Begin iterator of second sequence. Second sequence
   *  must have same length as first sequence.
   *  @param pred Find predicate.
   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
   *  @return Place of finding in both sequences.
   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+  template<
+      typename RandomAccessIterator1,
+      typename RandomAccessIterator2,
+      typename Pred,
+      typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
-  find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag)
+  find_template(RandomAccessIterator1 begin1,
+                RandomAccessIterator1 end1,
+                RandomAccessIterator2 begin2,
+                Pred pred,
+                Selector selector,
+                equal_split_tag)
   {
     _GLIBCXX_CALL(end1 - begin1)
@@ -100,79 +116,89 @@ namespace __gnu_parallel
     typedef typename traits_type::value_type value_type;
 
     difference_type length = end1 - begin1;
     difference_type result = length;
+    difference_type* borders;
 
-    const thread_index_t num_threads = get_max_threads();
     omp_lock_t result_lock;
     omp_init_lock(&result_lock);
 
-    difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
-    equally_split(length, num_threads, borders);
-
-#pragma omp parallel shared(result) num_threads(num_threads)
-    {
-      int iam = omp_get_thread_num();
-      difference_type pos = borders[iam], limit = borders[iam + 1];
-
-      RandomAccessIterator1 i1 = begin1 + pos;
-      RandomAccessIterator2 i2 = begin2 + pos;
-      for (; pos < limit; pos++)
-        {
-#pragma omp flush(result)
-          // Result has been set to something lower.
-          if (result < pos)
-            break;
-
-          if (selector(i1, i2, pred))
-            {
-              omp_set_lock(&result_lock);
-              if (result > pos)
-                result = pos;
-              omp_unset_lock(&result_lock);
-              break;
-            }
-          i1++;
-          i2++;
-        }
-    }
+    thread_index_t num_threads = get_max_threads();
+#   pragma omp parallel num_threads(num_threads)
+    {
+#     pragma omp single
+      {
+        num_threads = omp_get_num_threads();
+        borders = new difference_type[num_threads + 1];
+        equally_split(length, num_threads, borders);
+      } //single
+
+      thread_index_t iam = omp_get_thread_num();
+      difference_type start = borders[iam], stop = borders[iam + 1];
+
+      RandomAccessIterator1 i1 = begin1 + start;
+      RandomAccessIterator2 i2 = begin2 + start;
+      for (difference_type pos = start; pos < stop; ++pos)
+        {
+#         pragma omp flush(result)
+          // Result has been set to something lower.
+          if (result < pos)
+            break;
+
+          if (selector(i1, i2, pred))
+            {
+              omp_set_lock(&result_lock);
+              if (pos < result)
+                result = pos;
+              omp_unset_lock(&result_lock);
+              break;
+            }
+          ++i1;
+          ++i2;
+        }
+    } //parallel
 
     omp_destroy_lock(&result_lock);
-    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
+    delete[] borders;
+    return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
+        begin1 + result, begin2 + result);
   }
 #endif
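For clarity, a self-contained sketch of the early-exit protocol used in the equal-splitting find above: a shared best-so-far position, `#pragma omp flush` to observe it cheaply, and a lock only on the rare update path. This is illustrative code, not the library implementation:

```cpp
#include <cstdio>
#include <omp.h>
#include <vector>

int main()
{
  std::vector<int> v(1000, 0);
  v[700] = 1;                       // the element to find
  long result = (long)v.size();     // shared "best so far"
  omp_lock_t result_lock;
  omp_init_lock(&result_lock);
#pragma omp parallel
  {
    int t = omp_get_thread_num(), p = omp_get_num_threads();
    long len = (long)v.size();
    long start = t * len / p, stop = (t + 1) * len / p;
    for (long pos = start; pos < stop; ++pos)
      {
#pragma omp flush(result)
        if (result < pos)           // an earlier hit was already published
          break;
        if (v[pos] == 1)
          {
            omp_set_lock(&result_lock);
            if (pos < result)
              result = pos;
            omp_unset_lock(&result_lock);
            break;
          }
      }
  }
  omp_destroy_lock(&result_lock);
  std::printf("found at %ld\n", result);
  return 0;
}
```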
 #if _GLIBCXX_FIND_GROWING_BLOCKS
 
   /**
   *  @brief Parallel std::find, growing block size variant.
   *  @param begin1 Begin iterator of first sequence.
   *  @param end1 End iterator of first sequence.
   *  @param begin2 Begin iterator of second sequence. Second sequence
   *  must have same length as first sequence.
   *  @param pred Find predicate.
   *  @param selector Functionality (e. g. std::find_if (), std::equal(),...)
   *  @return Place of finding in both sequences.
   *  @see __gnu_parallel::Settings::find_sequential_search_size
   *  @see __gnu_parallel::Settings::find_initial_block_size
   *  @see __gnu_parallel::Settings::find_maximum_block_size
   *  @see __gnu_parallel::Settings::find_increasing_factor
   *
   *  There are two main differences between the growing blocks and
   *  the constant-size blocks variants.
   *  1. For GB, the block size grows; for CSB, the block size is fixed.
   *  2. For GB, the blocks are allocated dynamically;
   *     for CSB, the blocks are allocated in a predetermined manner,
   *     namely spacial round-robin.
   */
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
+  template<
+      typename RandomAccessIterator1,
+      typename RandomAccessIterator2,
+      typename Pred,
+      typename Selector>
   std::pair<RandomAccessIterator1, RandomAccessIterator2>
   find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
                 RandomAccessIterator2 begin2, Pred pred, Selector selector,
                 growing_blocks_tag)
   {
     _GLIBCXX_CALL(end1 - begin1)
@@ -182,101 +208,118 @@ namespace __gnu_parallel

      difference_type length = end1 - begin1;

      difference_type sequential_search_size = std::min<difference_type>(
          length, Settings::find_sequential_search_size);

      // Try it sequentially first.
      std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
        selector.sequential_algorithm(
            begin1, begin1 + sequential_search_size, begin2, pred);

      if (find_seq_result.first != (begin1 + sequential_search_size))
        return find_seq_result;

      // Index of beginning of next free block (after sequential find).
      difference_type next_block_start = sequential_search_size;
      difference_type result = length;

      omp_lock_t result_lock;
      omp_init_lock(&result_lock);

      thread_index_t num_threads = get_max_threads();
#     pragma omp parallel shared(result) num_threads(num_threads)
      {
#       pragma omp single
          num_threads = omp_get_num_threads();

        // Not within first k elements -> start parallel.
        thread_index_t iam = omp_get_thread_num();

        difference_type block_size = Settings::find_initial_block_size;
        difference_type start =
          fetch_and_add<difference_type>(&next_block_start, block_size);

        // Get new block, update pointer to next block.
        difference_type stop =
          std::min<difference_type>(length, start + block_size);

        std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;

        while (start < length)
          {
#           pragma omp flush(result)
            // Get new value of result.
            if (result < start)
              {
                // No chance to find first element.
                break;
              }

            local_result = selector.sequential_algorithm(
                begin1 + start, begin1 + stop, begin2 + start, pred);
            if (local_result.first != (begin1 + stop))
              {
                omp_set_lock(&result_lock);
                if ((local_result.first - begin1) < result)
                  {
                    result = local_result.first - begin1;

                    // Result cannot be in future blocks, stop algorithm.
                    fetch_and_add<difference_type>(&next_block_start, length);
                  }
                omp_unset_lock(&result_lock);
              }

            block_size = std::min<difference_type>(
                block_size * Settings::find_increasing_factor,
                Settings::find_maximum_block_size);

            // Get new block, update pointer to next block.
            start =
              fetch_and_add<difference_type>(&next_block_start, block_size);
            stop = (length < (start + block_size)) ?
                length : (start + block_size);
          }
      } //parallel

      omp_destroy_lock(&result_lock);

      // Return iterator on found element.
      return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
          begin1 + result, begin2 + result);
    }
#endif
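The pattern just introduced (request num_threads, then re-read omp_get_num_threads() inside a single construct) is the heart of the omp_dynamic fix: with omp_set_dynamic(true) the runtime may create fewer threads than requested, so the team size must be taken from inside the region itself. A minimal standalone sketch of the idiom, not library code:

#include <cstdio>
#include <omp.h>

int main()
{
  omp_set_dynamic(1);              // the runtime may deliver fewer threads
  int requested = 8;
  int delivered = 0;

#pragma omp parallel num_threads(requested)
  {
    // One thread records the team size actually created; the implicit
    // barrier at the end of 'single' publishes it to the whole team.
#pragma omp single
    delivered = omp_get_num_threads();

    // ... size all per-thread data by 'delivered', not 'requested' ...
  }

  std::printf("requested %d, got %d\n", requested, delivered);
  return 0;
}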
#if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS

  /**
   *  @brief Parallel std::find, constant block size variant.
   *  @param begin1 Begin iterator of first sequence.
   *  @param end1 End iterator of first sequence.
   *  @param begin2 Begin iterator of second sequence. Second sequence
   *  must have same length as first sequence.
   *  @param pred Find predicate.
   *  @param selector Functionality (e. g. std::find_if(), std::equal(),...)
   *  @return Place of finding in both sequences.
   *  @see __gnu_parallel::Settings::find_sequential_search_size
   *  @see __gnu_parallel::Settings::find_block_size
   *  There are two main differences between the growing blocks and the
   *  constant-size blocks variants.
   *  1. For GB, the block size grows; for CSB, the block size is fixed.
   *  2. For GB, the blocks are allocated dynamically; for CSB, the
   *  blocks are allocated in a predetermined manner, namely spatial
   *  round-robin.
   */
  template<
      typename RandomAccessIterator1,
      typename RandomAccessIterator2,
      typename Pred,
      typename Selector>
    std::pair<RandomAccessIterator1, RandomAccessIterator2>
    find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
                  RandomAccessIterator2 begin2, Pred pred, Selector selector,
                  constant_size_blocks_tag)
    {
      _GLIBCXX_CALL(end1 - begin1)

      typedef std::iterator_traits<RandomAccessIterator1> traits_type;

@@ -285,72 +328,77 @@ namespace __gnu_parallel

      difference_type length = end1 - begin1;

      difference_type sequential_search_size = std::min<difference_type>(
          length, Settings::find_sequential_search_size);

      // Try it sequentially first.
      std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
        selector.sequential_algorithm(begin1, begin1 + sequential_search_size,
                                      begin2, pred);

      if (find_seq_result.first != (begin1 + sequential_search_size))
        return find_seq_result;

      difference_type result = length;
      omp_lock_t result_lock;
      omp_init_lock(&result_lock);

      // Not within first sequential_search_size elements -> start parallel.

      thread_index_t num_threads = get_max_threads();
#     pragma omp parallel shared(result) num_threads(num_threads)
      {
#       pragma omp single
          num_threads = omp_get_num_threads();

        thread_index_t iam = omp_get_thread_num();
        difference_type block_size = Settings::find_initial_block_size;

        // First element of thread's current iteration.
        difference_type iteration_start = sequential_search_size;

        // Where to work (initialization).
        difference_type start = iteration_start + iam * block_size;
        difference_type stop =
          std::min<difference_type>(length, start + block_size);

        std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;

        while (start < length)
          {
            // Get new value of result.
#           pragma omp flush(result)
            // No chance to find first element.
            if (result < start)
              break;

            local_result = selector.sequential_algorithm(
                begin1 + start, begin1 + stop,
                begin2 + start, pred);
            if (local_result.first != (begin1 + stop))
              {
                omp_set_lock(&result_lock);
                if ((local_result.first - begin1) < result)
                  result = local_result.first - begin1;
                omp_unset_lock(&result_lock);
                // Will not find better value in its interval.
                break;
              }

            iteration_start += num_threads * block_size;

            // Where to work.
            start = iteration_start + iam * block_size;
            stop = std::min<difference_type>(length, start + block_size);
          }
      } //parallel

      omp_destroy_lock(&result_lock);

      // Return iterator on found element.
      return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
          begin1 + result, begin2 + result);
    }
#endif

} // end namespace

#endif
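As an illustration of the constant-size-blocks strategy described above, here is a self-contained sketch, not the library code: thread iam scans blocks iam, iam + p, iam + 2p, and so on, and a shared result records the earliest hit. The function name and the use of plain int/long are invented for the example.

#include <algorithm>
#include <cstdio>
#include <vector>
#include <omp.h>

// Index of the first element equal to 'target', scanning constant-size
// blocks assigned round-robin: thread iam visits blocks iam, iam + p, ...
long find_first_csb(const std::vector<int>& v, int target, long block_size)
{
  long length = (long)v.size();
  long result = length;                       // "not found" sentinel

#pragma omp parallel shared(result)
  {
    int p = omp_get_num_threads();
    int iam = omp_get_thread_num();

    for (long start = iam * block_size; start < length;
         start += (long)p * block_size)
      {
#pragma omp flush(result)
        if (result < start)                   // an earlier block already hit
          break;
        long stop = std::min(length, start + block_size);
        for (long i = start; i < stop; ++i)
          if (v[i] == target)
            {
#pragma omp critical
              result = std::min(result, i);
              break;
            }
      }
  }
  return result;
}

int main()
{
  std::vector<int> v(1000, 0);
  v[637] = 1;
  std::printf("%ld\n", find_first_csb(v, 1, 64));   // prints 637
}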
File diff suppressed because it is too large

File diff suppressed because it is too large
@@ -48,8 +48,8 @@

namespace __gnu_parallel
{
  /** @brief Subsequence description. */
  template<typename _DifferenceTp>
    struct Piece
    {
      typedef _DifferenceTp difference_type;

@@ -61,16 +61,19 @@ namespace __gnu_parallel

      difference_type end;
    };

  /** @brief Data accessed by all threads.
   *
   *  PMWMS = parallel multiway mergesort */
  template<typename RandomAccessIterator>
    struct PMWMSSortingData
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      /** @brief Number of threads involved. */
      thread_index_t num_threads;

      /** @brief Input begin. */
      RandomAccessIterator source;

@@ -105,62 +108,55 @@ namespace __gnu_parallel

      /** @brief Pieces of data to merge @c [thread][sequence] */
      std::vector<Piece<difference_type> >* pieces;

      /** @brief Stable sorting desired. */
      bool stable;
    };

  /**
   *  @brief Select samples from a sequence.
   *  @param sd Pointer to algorithm data. Result will be placed in
   *  @c sd->samples.
   *  @param num_samples Number of samples to select.
   */
  template<typename RandomAccessIterator, typename _DifferenceTp>
    inline void
    determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
                      _DifferenceTp& num_samples)
    {
      typedef _DifferenceTp difference_type;

      thread_index_t iam = omp_get_thread_num();

      num_samples =
        Settings::sort_mwms_oversampling * sd->num_threads - 1;

      difference_type* es = new difference_type[num_samples + 2];

      equally_split(sd->starts[iam + 1] - sd->starts[iam],
                    num_samples + 1, es);

      for (difference_type i = 0; i < num_samples; i++)
        sd->samples[iam * num_samples + i] =
          sd->source[sd->starts[iam] + es[i + 1]];

      delete[] es;
    }
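determine_samples leans on equally_split to pick equidistant sample positions within the thread's chunk. A sketch of an equally_split-style helper with the semantics assumed by these uses (p + 1 borders over [0, n), chunk lengths differing by at most one):

#include <cstdio>

// Writes the p+1 chunk borders of [0, n) into 'borders'; chunk lengths
// differ by at most one (semantics assumed from the uses above).
void equally_split_sketch(long n, int p, long* borders)
{
  long chunk = n / p;
  long remainder = n % p;
  long pos = 0;
  for (int i = 0; i < p; ++i)
    {
      borders[i] = pos;
      pos += (i < remainder) ? chunk + 1 : chunk;
    }
  borders[p] = pos;                 // == n
}

int main()
{
  long b[5];
  equally_split_sketch(10, 4, b);
  for (int i = 0; i <= 4; ++i)
    std::printf("%ld ", b[i]);      // 0 3 6 8 10
  return 0;
}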
  /** @brief PMWMS code executed by each thread.
   *  @param sd Pointer to algorithm data.
   *  @param comp Comparator.
   */
  template<typename RandomAccessIterator, typename Comparator>
    inline void
    parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
                          Comparator& comp)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      thread_index_t iam = omp_get_thread_num();

      // Length of this thread's chunk, before merging.
      difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];

@@ -174,161 +170,168 @@ namespace __gnu_parallel

      typedef value_type* SortingPlacesIterator;

      // Sort in temporary storage, leave space for sentinel.
      sd->sorting_places[iam] = sd->temporaries[iam] =
        static_cast<value_type*>(
          ::operator new(sizeof(value_type) * (length_local + 1)));

      // Copy there.
      std::uninitialized_copy(sd->source + sd->starts[iam],
                              sd->source + sd->starts[iam] + length_local,
                              sd->sorting_places[iam]);
#endif

      // Sort locally.
      if (sd->stable)
        __gnu_sequential::stable_sort(sd->sorting_places[iam],
                                      sd->sorting_places[iam] + length_local,
                                      comp);
      else
        __gnu_sequential::sort(sd->sorting_places[iam],
                               sd->sorting_places[iam] + length_local,
                               comp);

      // Invariant: locally sorted subsequence in sd->sorting_places[iam],
      // sd->sorting_places[iam] + length_local.

      if (Settings::sort_splitting == Settings::SAMPLING)
        {
          difference_type num_samples;
          determine_samples(sd, num_samples);

#         pragma omp barrier

#         pragma omp single
          __gnu_sequential::sort(sd->samples,
                                 sd->samples + (num_samples * sd->num_threads),
                                 comp);

#         pragma omp barrier

          for (int s = 0; s < sd->num_threads; s++)
            {
              // For each sequence.
              if (num_samples * iam > 0)
                sd->pieces[iam][s].begin =
                  std::lower_bound(sd->sorting_places[s],
                      sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
                      sd->samples[num_samples * iam],
                      comp)
                  - sd->sorting_places[s];
              else
                // Absolute beginning.
                sd->pieces[iam][s].begin = 0;

              if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
                sd->pieces[iam][s].end =
                  std::lower_bound(sd->sorting_places[s],
                      sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
                      sd->samples[num_samples * (iam + 1)], comp)
                  - sd->sorting_places[s];
              else
                // Absolute end.
                sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
            }
        }
      else if (Settings::sort_splitting == Settings::EXACT)
        {
#         pragma omp barrier

          std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
            seqs(sd->num_threads);
          for (int s = 0; s < sd->num_threads; s++)
            seqs[s] = std::make_pair(sd->sorting_places[s],
                sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);

          std::vector<SortingPlacesIterator> offsets(sd->num_threads);

          // if not last thread
          if (iam < sd->num_threads - 1)
            multiseq_partition(seqs.begin(), seqs.end(),
                               sd->starts[iam + 1], offsets.begin(), comp);

          for (int seq = 0; seq < sd->num_threads; seq++)
            {
              // for each sequence
              if (iam < (sd->num_threads - 1))
                sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
              else
                // very end of this sequence
                sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
            }

#         pragma omp barrier

          for (int seq = 0; seq < sd->num_threads; seq++)
            {
              // For each sequence.
              if (iam > 0)
                sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
              else
                // Absolute beginning.
                sd->pieces[iam][seq].begin = 0;
            }
        }

      // Offset from target begin, length after merging.
      difference_type offset = 0, length_am = 0;
      for (int s = 0; s < sd->num_threads; s++)
        {
          length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
          offset += sd->pieces[iam][s].begin;
        }

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      // Merge to temporary storage, uninitialized creation not possible
      // since there is no multiway_merge calling the placement new
      // instead of the assignment operator.
      sd->merging_places[iam] = sd->temporaries[iam] =
        static_cast<value_type*>(
          ::operator new(sizeof(value_type) * length_am));
#else
      // Merge directly to target.
      sd->merging_places[iam] = sd->source + offset;
#endif

      std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
        seqs(sd->num_threads);

      for (int s = 0; s < sd->num_threads; s++)
        {
          seqs[s] =
            std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
                           sd->sorting_places[s] + sd->pieces[iam][s].end);
        }

      multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp,
                     length_am, sd->stable, false, sequential_tag());

#     pragma omp barrier

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      // Write back.
      std::copy(sd->merging_places[iam],
                sd->merging_places[iam] + length_am,
                sd->source + offset);
#endif

      delete[] sd->temporaries[iam];
    }
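The last step hands each thread's pieces to multiway_merge. To make the k-way merge itself concrete, here is a stand-in illustration; the library uses a loser tree, and the binary heap below is merely the simplest correct substitute:

#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

// Merges k sorted runs into one sorted output via a min-heap of
// (value, run index) pairs; pops the global minimum, then refills
// from the run it came from.
std::vector<int> kway_merge(const std::vector<std::vector<int>>& runs)
{
  using Item = std::pair<int, size_t>;            // (value, run index)
  std::priority_queue<Item, std::vector<Item>, std::greater<Item>> heap;
  std::vector<size_t> pos(runs.size(), 0);
  std::vector<int> out;

  for (size_t r = 0; r < runs.size(); ++r)
    if (!runs[r].empty())
      heap.push({runs[r][0], r});

  while (!heap.empty())
    {
      auto [value, r] = heap.top();
      heap.pop();
      out.push_back(value);
      if (++pos[r] < runs[r].size())
        heap.push({runs[r][pos[r]], r});
    }
  return out;
}

int main()
{
  auto merged = kway_merge({{1, 4, 9}, {2, 3, 8}, {5, 6, 7}});
  for (int x : merged)
    std::printf("%d ", x);          // 1 2 3 4 5 6 7 8 9
}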
  /** @brief PMWMS main call.
   *  @param begin Begin iterator of sequence.
   *  @param end End iterator of sequence.
   *  @param comp Comparator.
   *  @param n Length of sequence.
   *  @param num_threads Number of threads to use.
   *  @param stable Stable sorting.
   */
  template<typename RandomAccessIterator, typename Comparator>
    inline void
    parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
                       Comparator comp,
                       typename std::iterator_traits<RandomAccessIterator>::difference_type n,
                       int num_threads,
                       bool stable)
    {
      _GLIBCXX_CALL(n)

      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

@@ -336,75 +339,75 @@ namespace __gnu_parallel

      if (n <= 1)
        return;

      // at least one element per thread
      if (num_threads > n)
        num_threads = static_cast<thread_index_t>(n);

      // shared variables
      PMWMSSortingData<RandomAccessIterator> sd;
      difference_type* starts;

#     pragma omp parallel num_threads(num_threads)
      {
        num_threads = omp_get_num_threads(); //no more threads than requested

#       pragma omp single
        {
          sd.num_threads = num_threads;
          sd.source = begin;
          sd.temporaries = new value_type*[num_threads];

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
          sd.sorting_places = new RandomAccessIterator[num_threads];
          sd.merging_places = new value_type*[num_threads];
#else
          sd.sorting_places = new value_type*[num_threads];
          sd.merging_places = new RandomAccessIterator[num_threads];
#endif

          if (Settings::sort_splitting == Settings::SAMPLING)
            {
              unsigned int size =
                (Settings::sort_mwms_oversampling * num_threads - 1)
                * num_threads;
              sd.samples = static_cast<value_type*>(
                ::operator new(size * sizeof(value_type)));
            }
          else
            sd.samples = NULL;

          sd.offsets = new difference_type[num_threads - 1];
          sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
          for (int s = 0; s < num_threads; s++)
            sd.pieces[s].resize(num_threads);
          starts = sd.starts = new difference_type[num_threads + 1];
          sd.stable = stable;

          difference_type chunk_length = n / num_threads;
          difference_type split = n % num_threads;
          difference_type pos = 0;
          for (int i = 0; i < num_threads; i++)
            {
              starts[i] = pos;
              pos += (i < split) ? (chunk_length + 1) : chunk_length;
            }
          starts[num_threads] = pos;
        }

        // Now sort in parallel.
        parallel_sort_mwms_pu(&sd, comp);
      } //parallel

      // XXX sd as RAII
      delete[] starts;
      delete[] sd.temporaries;
      delete[] sd.sorting_places;
      delete[] sd.merging_places;

      if (Settings::sort_splitting == Settings::SAMPLING)
        delete[] sd.samples;

      delete[] sd.offsets;
      delete[] sd.pieces;
    }
} //namespace __gnu_parallel

#endif
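To see the SAMPLING splitting in isolation: once the sample array is globally sorted, thread iam claims from every locally sorted run the subrange between its two splitter samples via std::lower_bound. A small sequential illustration with invented data:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  // Two locally sorted runs and one globally chosen splitter sample:
  // thread 0 takes the elements < 5 from each run, thread 1 the rest.
  std::vector<std::vector<int>> runs = {{1, 4, 9, 12}, {2, 3, 8, 10}};
  std::vector<int> samples = {5};
  int num_threads = 2;

  for (int iam = 0; iam < num_threads; ++iam)
    for (size_t s = 0; s < runs.size(); ++s)
      {
        auto lo = (iam == 0)
          ? runs[s].begin()
          : std::lower_bound(runs[s].begin(), runs[s].end(),
                             samples[iam - 1]);
        auto hi = (iam == num_threads - 1)
          ? runs[s].end()
          : std::lower_bound(runs[s].begin(), runs[s].end(), samples[iam]);
        std::printf("thread %d, run %zu: piece [%td, %td)\n",
                    iam, s, lo - runs[s].begin(), hi - runs[s].begin());
      }
}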
@@ -43,54 +43,71 @@

#include <parallel/settings.h>
#include <parallel/basic_iterator.h>
#include <parallel/base.h>

namespace __gnu_parallel
{
  /** @brief Embarrassingly parallel algorithm for random access
   *  iterators, using an OpenMP for loop.
   *
   *  @param begin Begin iterator of element sequence.
   *  @param end End iterator of element sequence.
   *  @param o User-supplied functor (comparator, predicate, adding
   *  functor, etc.).
   *  @param f Functor to "process" an element with op (depends on
   *  desired functionality, e. g. for std::for_each(), ...).
   *  @param r Functor to "add" a single result to the already
   *  processed elements (depends on functionality).
   *  @param base Base value for reduction.
   *  @param output Pointer to position where final result is written to
   *  @param bound Maximum number of elements processed (e. g. for
   *  std::count_n()).
   *  @return User-supplied functor (that may contain a part of the result).
   */
  template<typename RandomAccessIterator,
           typename Op,
           typename Fu,
           typename Red,
           typename Result>
    Op
    for_each_template_random_access_omp_loop(
        RandomAccessIterator begin,
        RandomAccessIterator end,
        Op o, Fu& f, Red r, Result base, Result& output,
        typename std::iterator_traits<RandomAccessIterator>::
            difference_type bound)
    {
      typedef typename
        std::iterator_traits<RandomAccessIterator>::difference_type
        difference_type;

      difference_type length = end - begin;
      thread_index_t num_threads =
        __gnu_parallel::min<difference_type>(get_max_threads(), length);

      Result *thread_results;

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          num_threads = omp_get_num_threads();
          thread_results = new Result[num_threads];

          for (thread_index_t i = 0; i < num_threads; i++)
            thread_results[i] = Result();
        }

        thread_index_t iam = omp_get_thread_num();

#       pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
        for (difference_type pos = 0; pos < length; pos++)
          thread_results[iam] =
            r(thread_results[iam], f(o, begin+pos));
      } //parallel

      for (thread_index_t i = 0; i < num_threads; i++)
        output = r(output, thread_results[i]);

      delete [] thread_results;

@@ -100,6 +117,7 @@ namespace __gnu_parallel

      return o;
    }
} // end namespace

#endif
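Reduced to a runnable toy, the structure above is: per-thread partial results, a dynamically scheduled OpenMP for loop, and a sequential reduction at the end, with the thread count re-read inside the region so the code stays correct when fewer threads materialize. A sketch, assuming plain summation for the reduction:

#include <cstdio>
#include <vector>
#include <omp.h>

int main()
{
  std::vector<int> v(1000, 1);
  long length = (long)v.size();
  int num_threads = omp_get_max_threads();
  std::vector<long> partial;

#pragma omp parallel num_threads(num_threads)
  {
#pragma omp single
    {
      num_threads = omp_get_num_threads();   // dynamic-safe actual count
      partial.assign(num_threads, 0L);       // one slot per real thread
    }

    int iam = omp_get_thread_num();

#pragma omp for schedule(dynamic, 64)
    for (long pos = 0; pos < length; ++pos)
      partial[iam] += v[pos];
  }

  long sum = 0;
  for (int i = 0; i < num_threads; ++i)      // sequential reduction
    sum += partial[i];
  std::printf("%ld\n", sum);                 // 1000
}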
@@ -64,39 +64,50 @@ namespace __gnu_parallel

   *  std::count_n()).
   *  @return User-supplied functor (that may contain a part of the result).
   */
  template<typename RandomAccessIterator,
           typename Op,
           typename Fu,
           typename Red,
           typename Result>
    Op
    for_each_template_random_access_omp_loop_static(
        RandomAccessIterator begin,
        RandomAccessIterator end,
        Op o, Fu& f, Red r, Result base, Result& output,
        typename std::iterator_traits<RandomAccessIterator>::
            difference_type bound)
    {
      typedef typename
        std::iterator_traits<RandomAccessIterator>::difference_type
        difference_type;

      difference_type length = end - begin;
      thread_index_t num_threads =
        std::min<difference_type>(get_max_threads(), length);

      Result *thread_results;

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          num_threads = omp_get_num_threads();
          thread_results = new Result[num_threads];

          for (thread_index_t i = 0; i < num_threads; i++)
            thread_results[i] = Result();
        }

        thread_index_t iam = omp_get_thread_num();

#       pragma omp for schedule(static, Settings::workstealing_chunk_size)
        for (difference_type pos = 0; pos < length; pos++)
          thread_results[iam] =
            r(thread_results[iam], f(o, begin+pos));
      } //parallel

      for (thread_index_t i = 0; i < num_threads; i++)
        output = r(output, thread_results[i]);

      delete [] thread_results;

@@ -106,6 +117,7 @@ namespace __gnu_parallel

      return o;
    }
} // end namespace

#endif
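The only substantive difference from omp_loop.h is schedule(static, chunk), which fixes a round-robin chunk-to-thread mapping before the loop runs instead of handing out chunks on demand. A tiny demo of that mapping (output order is nondeterministic):

#include <cstdio>
#include <omp.h>

int main()
{
#pragma omp parallel num_threads(2)
  {
    // With schedule(static, 2) and two threads, thread 0 gets
    // iterations {0,1,4,5} and thread 1 gets {2,3,6,7}.
#pragma omp for schedule(static, 2)
    for (int i = 0; i < 8; ++i)
      std::printf("iteration %d -> thread %d\n", i, omp_get_thread_num());
  }
}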
@@ -41,69 +41,80 @@

#include <omp.h>
#include <parallel/settings.h>
#include <parallel/base.h>

namespace __gnu_parallel
{
  /** @brief Embarrassingly parallel algorithm for random access
   *  iterators, using hand-crafted parallelization by equal splitting
   *  the work.
   *
   *  @param begin Begin iterator of element sequence.
   *  @param end End iterator of element sequence.
   *  @param o User-supplied functor (comparator, predicate, adding
   *  functor, ...)
   *  @param f Functor to "process" an element with op (depends on
   *  desired functionality, e. g. for std::for_each(), ...).
   *  @param r Functor to "add" a single result to the already
   *  processed elements (depends on functionality).
   *  @param base Base value for reduction.
   *  @param output Pointer to position where final result is written to
   *  @param bound Maximum number of elements processed (e. g. for
   *  std::count_n()).
   *  @return User-supplied functor (that may contain a part of the result).
   */
  template<
      typename RandomAccessIterator,
      typename Op,
      typename Fu,
      typename Red,
      typename Result>
    Op
    for_each_template_random_access_ed(
        RandomAccessIterator begin,
        RandomAccessIterator end,
        Op o, Fu& f, Red r, Result base, Result& output,
        typename std::iterator_traits<RandomAccessIterator>::
            difference_type bound)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::difference_type difference_type;

      const difference_type length = end - begin;
      Result *thread_results;

      thread_index_t num_threads =
        __gnu_parallel::min<difference_type>(get_max_threads(), length);

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          num_threads = omp_get_num_threads();
          thread_results = new Result[num_threads];
        }

        thread_index_t iam = omp_get_thread_num();

        // Neutral element.
        Result reduct = Result();

        difference_type
          start = equally_split_point(length, num_threads, iam),
          stop = equally_split_point(length, num_threads, iam + 1);

        if (start < stop)
          {
            reduct = f(o, begin + start);
            ++start;
          }

        for (; start < stop; ++start)
          reduct = r(reduct, f(o, begin + start));

        thread_results[iam] = reduct;
      } //parallel

      for (thread_index_t i = 0; i < num_threads; i++)
        output = r(output, thread_results[i]);
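equally_split_point is used here with the assumed meaning: the start of chunk rank when [0, n) is split into p near-equal parts, so [point(rank), point(rank + 1)) is thread rank's range. A sketch consistent with that use, mirroring the equally_split helper above:

#include <algorithm>
#include <cstdio>

// Start of chunk 'rank' when [0, n) is split into p near-equal parts;
// equally_split_point(n, p, p) == n, so the half-open interval
// [point(rank), point(rank + 1)) is thread rank's range.
long equally_split_point(long n, long p, long rank)
{
  long chunk = n / p;
  long remainder = n % p;
  return rank * chunk + std::min(rank, remainder);
}

int main()
{
  long n = 10, p = 4;
  for (long rank = 0; rank < p; ++rank)
    std::printf("thread %ld: [%ld, %ld)\n", rank,
                equally_split_point(n, p, rank),
                equally_split_point(n, p, rank + 1));
  // thread 0: [0, 3)  thread 1: [3, 6)  thread 2: [6, 8)  thread 3: [8, 10)
}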
@@ -48,130 +48,156 @@ namespace __gnu_parallel

{
  // Problem: there is no 0-element given.

  /** @brief Base case prefix sum routine.
   *  @param begin Begin iterator of input sequence.
   *  @param end End iterator of input sequence.
   *  @param result Begin iterator of output sequence.
   *  @param bin_op Associative binary function.
   *  @param value Start value. Must be passed since the neutral
   *  element is unknown in general.
   *  @return End iterator of output sequence. */
  template<
      typename InputIterator,
      typename OutputIterator,
      typename BinaryOperation>
    inline OutputIterator
    parallel_partial_sum_basecase(
        InputIterator begin, InputIterator end,
        OutputIterator result, BinaryOperation bin_op,
        typename std::iterator_traits<InputIterator>::value_type value)
    {
      if (begin == end)
        return result;

      while (begin != end)
        {
          value = bin_op(value, *begin);
          *result = value;
          result++;
          begin++;
        }
      return result;
    }
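Since bin_op is arbitrary, no neutral element is available to start from; callers therefore write the first output themselves and seed the basecase with *begin, exactly as the callers below do. A runnable restatement of that convention:

#include <cstdio>
#include <functional>

// The basecase restated standalone: an inclusive scan seeded with an
// explicit start value.
template<typename In, typename Out, typename Op, typename T>
Out partial_sum_basecase(In begin, In end, Out result, Op bin_op, T value)
{
  while (begin != end)
    {
      value = bin_op(value, *begin);
      *result = value;
      ++result;
      ++begin;
    }
  return result;
}

int main()
{
  int in[] = {2, 3, 4}, out[3];
  out[0] = in[0];                       // first output is just *begin
  partial_sum_basecase(in + 1, in + 3, out + 1, std::plus<int>(), in[0]);
  for (int i = 0; i < 3; ++i)
    std::printf("%d ", out[i]);         // 2 5 9
}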
  /** @brief Parallel partial sum implementation, two-phase approach,
      no recursion.
   *  @param begin Begin iterator of input sequence.
   *  @param end End iterator of input sequence.
   *  @param result Begin iterator of output sequence.
   *  @param bin_op Associative binary function.
   *  @param n Length of sequence.
   *  @return End iterator of output sequence.
   */
  template<
      typename InputIterator,
      typename OutputIterator,
      typename BinaryOperation>
    OutputIterator
    parallel_partial_sum_linear(
        InputIterator begin, InputIterator end,
        OutputIterator result, BinaryOperation bin_op,
        typename std::iterator_traits<InputIterator>::difference_type n)
    {
      typedef std::iterator_traits<InputIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      thread_index_t num_threads =
        std::min<difference_type>(get_max_threads(), n - 1);

      if (num_threads < 2)
        {
          *result = *begin;
          return parallel_partial_sum_basecase(
              begin + 1, end, result + 1, bin_op, *begin);
        }

      difference_type* borders;
      value_type* sums;

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          num_threads = omp_get_num_threads();

          borders = new difference_type[num_threads + 2];

          if (Settings::partial_sum_dilatation == 1.0f)
            equally_split(n, num_threads + 1, borders);
          else
            {
              difference_type chunk_length =
                ((double)n /
                 ((double)num_threads + Settings::partial_sum_dilatation)),
                borderstart = n - num_threads * chunk_length;
              borders[0] = 0;
              for (int i = 1; i < (num_threads + 1); i++)
                {
                  borders[i] = borderstart;
                  borderstart += chunk_length;
                }
              borders[num_threads + 1] = n;
            }

          sums = static_cast<value_type*>(
            ::operator new(sizeof(value_type) * num_threads));
          OutputIterator target_end;
        } //single

        int iam = omp_get_thread_num();
        if (iam == 0)
          {
            *result = *begin;
            parallel_partial_sum_basecase(begin + 1, begin + borders[1],
                                          result + 1, bin_op, *begin);
            sums[0] = *(result + borders[1] - 1);
          }
        else
          {
            sums[iam] = std::accumulate(begin + borders[iam] + 1,
                                        begin + borders[iam + 1],
                                        *(begin + borders[iam]),
                                        bin_op,
                                        __gnu_parallel::sequential_tag());
          }

#       pragma omp barrier

#       pragma omp single
        parallel_partial_sum_basecase(
            sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);

#       pragma omp barrier

        // Still same team.
        parallel_partial_sum_basecase(begin + borders[iam + 1],
                                      begin + borders[iam + 2],
                                      result + borders[iam + 1], bin_op,
                                      sums[iam]);
      } //parallel

      delete[] sums;
      delete[] borders;

      return result + n;
    }
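The two phases are easiest to follow sequentially: chunk 0 writes its outputs directly while every other chunk only reduces; the chunk sums are then prefix-summed themselves; finally each remaining chunk scans its range seeded with the preceding total. A worked toy example with three chunks and addition (each loop body would run in a different thread in the real algorithm):

#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> in = {1, 2, 3, 4, 5, 6}, out(6);
  long borders[] = {0, 2, 4, 6};            // three chunks of two
  int sums[3];

  // Phase 1: chunk 0 already writes its outputs, the rest only reduce.
  out[0] = in[0];
  out[1] = in[0] + in[1];
  sums[0] = out[1];
  for (int c = 1; c < 3; ++c)
    sums[c] = std::accumulate(in.begin() + borders[c] + 1,
                              in.begin() + borders[c + 1],
                              in[borders[c]]);          // sums = 3, 7, 11

  // Prefix the chunk sums themselves.
  for (int c = 1; c < 3; ++c)
    sums[c] += sums[c - 1];                             // sums = 3, 10, 21

  // Phase 2: each remaining chunk scans, seeded with the previous total.
  for (int c = 0; c < 2; ++c)
    {
      int value = sums[c];
      for (long i = borders[c + 1]; i < borders[c + 2]; ++i)
        {
          value += in[i];
          out[i] = value;
        }
    }

  for (int x : out)
    std::printf("%d ", x);                  // 1 3 6 10 15 21
}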
  /** @brief Parallel partial sum front-end.
   *  @param begin Begin iterator of input sequence.
   *  @param end End iterator of input sequence.
   *  @param result Begin iterator of output sequence.
   *  @param bin_op Associative binary function.
   *  @return End iterator of output sequence. */
  template<
      typename InputIterator,
      typename OutputIterator,
      typename BinaryOperation>
    OutputIterator
    parallel_partial_sum(InputIterator begin, InputIterator end,
                         OutputIterator result, BinaryOperation bin_op)
    {
      _GLIBCXX_CALL(begin - end)

      typedef std::iterator_traits<InputIterator> traits_type;
      typedef typename traits_type::value_type value_type;

@@ -179,18 +205,15 @@ namespace __gnu_parallel

      difference_type n = end - begin;

      switch (Settings::partial_sum_algorithm)
        {
        case Settings::LINEAR:
          // Need an initial offset.
          return parallel_partial_sum_linear(begin, end, result, bin_op, n);
        default:
          // Partial_sum algorithm not implemented.
          _GLIBCXX_PARALLEL_ASSERT(0);
          return result + n;
        }
    }
}
@ -45,21 +45,21 @@
#include <bits/stl_algo.h> #include <bits/stl_algo.h>
#include <parallel/parallel.h> #include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */ /** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile #define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Parallel implementation of std::partition. /** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split. * @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split. * @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot. * @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task. * @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */ * @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate> template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end, parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads) Predicate pred, thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
@ -74,212 +74,238 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right; _GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew; _GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right; bool* reserved_left = NULL, * reserved_right = NULL;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
difference_type chunk_size; difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
// At least good for two processors. //at least two chunks per thread
while (right - left + 1 >= 2 * max_num_threads * chunk_size) if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{ {
difference_type num_chunks = (right - left + 1) / chunk_size; # pragma omp single
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2); {
num_threads = omp_get_num_threads();
reserved_left = new bool[num_threads];
reserved_right = new bool[num_threads];
for (int r = 0; r < num_threads; r++) if (Settings::partition_chunk_share > 0.0)
{ chunk_size = std::max<difference_type>(
reserved_left[r] = false; Settings::partition_chunk_size,
reserved_right[r] = false; (double)n * Settings::partition_chunk_share /
} (double)num_threads);
leftover_left = 0; else
leftover_right = 0; chunk_size = Settings::partition_chunk_size;
}
#pragma omp parallel num_threads(num_threads) while (right - left + 1 >= 2 * num_threads * chunk_size)
{ {
// Private. # pragma omp single
difference_type thread_left, thread_left_border, thread_right, thread_right_border; {
thread_left = left + 1; difference_type num_chunks = (right - left + 1) / chunk_size;
// Just to satisfy the condition below. for (int r = 0; r < num_threads; r++)
thread_left_border = thread_left - 1; {
thread_right = n - 1; reserved_left[r] = false;
thread_right_border = thread_right + 1; reserved_right[r] = false;
}
leftover_left = 0;
leftover_right = 0;
} //implicit barrier
bool iam_finished = false; // Private.
while (!iam_finished) difference_type thread_left, thread_left_border,
{ thread_right, thread_right_border;
if (thread_left > thread_left_border) thread_left = left + 1;
{
omp_set_lock(&result_lock);
if (left + (chunk_size - 1) > right)
iam_finished = true;
else
{
thread_left = left;
thread_left_border = left + (chunk_size - 1);
left += chunk_size;
}
omp_unset_lock(&result_lock);
}
if (thread_right < thread_right_border) // Just to satisfy the condition below.
{ thread_left_border = thread_left - 1;
omp_set_lock(&result_lock); thread_right = n - 1;
if (left > right - (chunk_size - 1)) thread_right_border = thread_right + 1;
iam_finished = true;
else
{
thread_right = right;
thread_right_border = right - (chunk_size - 1);
right -= chunk_size;
}
omp_unset_lock(&result_lock);
}
if (iam_finished) bool iam_finished = false;
break; while (!iam_finished)
{
if (thread_left > thread_left_border)
{
omp_set_lock(&result_lock);
if (left + (chunk_size - 1) > right)
iam_finished = true;
else
{
thread_left = left;
thread_left_border = left + (chunk_size - 1);
left += chunk_size;
}
omp_unset_lock(&result_lock);
}
// Swap as usual. if (thread_right < thread_right_border)
while (thread_left < thread_right) {
{ omp_set_lock(&result_lock);
while (pred(begin[thread_left]) && thread_left <= thread_left_border) if (left > right - (chunk_size - 1))
thread_left++; iam_finished = true;
while (!pred(begin[thread_right]) && thread_right >= thread_right_border) else
thread_right--; {
thread_right = right;
thread_right_border = right - (chunk_size - 1);
right -= chunk_size;
}
omp_unset_lock(&result_lock);
}
if (thread_left > thread_left_border || thread_right < thread_right_border) if (iam_finished)
// Fetch new chunk(s). break;
break;
std::swap(begin[thread_left], begin[thread_right]); // Swap as usual.
thread_left++; while (thread_left < thread_right)
thread_right--; {
} while (pred(begin[thread_left])
} && thread_left <= thread_left_border)
thread_left++;
          while (!pred(begin[thread_right])
                 && thread_right >= thread_right_border)
            thread_right--;

          if (thread_left > thread_left_border
              || thread_right < thread_right_border)
            // Fetch new chunk(s).
            break;

          std::swap(begin[thread_left], begin[thread_right]);
          thread_left++;
          thread_right--;
        }
    }

  // Now swap the leftover chunks to the right places.
  if (thread_left <= thread_left_border)
#   pragma omp atomic
    leftover_left++;
  if (thread_right >= thread_right_border)
#   pragma omp atomic
    leftover_right++;

#   pragma omp barrier

#   pragma omp single
  {
    leftnew = left - leftover_left * chunk_size;
    rightnew = right + leftover_right * chunk_size;
  }

#   pragma omp barrier

  // <=> thread_left_border + (chunk_size - 1) >= leftnew
  if (thread_left <= thread_left_border
      && thread_left_border >= leftnew)
    {
      // Chunk already in place, reserve spot.
      reserved_left[(left - (thread_left_border + 1)) / chunk_size]
        = true;
    }

  // <=> thread_right_border - (chunk_size - 1) <= rightnew
  if (thread_right >= thread_right_border
      && thread_right_border <= rightnew)
    {
      // Chunk already in place, reserve spot.
      reserved_right[((thread_right_border - 1) - right) / chunk_size]
        = true;
    }

#   pragma omp barrier

  if (thread_left <= thread_left_border
      && thread_left_border < leftnew)
    {
      // Find spot and swap.
      difference_type swapstart = -1;
      omp_set_lock(&result_lock);
      for (int r = 0; r < leftover_left; r++)
        if (!reserved_left[r])
          {
            reserved_left[r] = true;
            swapstart = left - (r + 1) * chunk_size;
            break;
          }
      omp_unset_lock(&result_lock);

#if _GLIBCXX_ASSERTIONS
      _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif

      std::swap_ranges(
        begin + thread_left_border - (chunk_size - 1),
        begin + thread_left_border + 1,
        begin + swapstart);
    }

  if (thread_right >= thread_right_border
      && thread_right_border > rightnew)
    {
      // Find spot and swap.
      difference_type swapstart = -1;
      omp_set_lock(&result_lock);
      for (int r = 0; r < leftover_right; r++)
        if (!reserved_right[r])
          {
            reserved_right[r] = true;
            swapstart = right + r * chunk_size + 1;
            break;
          }
      omp_unset_lock(&result_lock);

#if _GLIBCXX_ASSERTIONS
      _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif

      std::swap_ranges(begin + thread_right_border,
                       begin + thread_right_border + chunk_size,
                       begin + swapstart);
    }

#if _GLIBCXX_ASSERTIONS
#   pragma omp barrier

#   pragma omp single
  {
    for (int r = 0; r < leftover_left; r++)
      _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
    for (int r = 0; r < leftover_right; r++)
      _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
  }

#   pragma omp barrier
#endif

#   pragma omp barrier

  left = leftnew;
  right = rightnew;
}

#   pragma omp flush(left, right)
} // end "recursion" //parallel

difference_type final_left = left, final_right = right;

while (final_left < final_right)
  {
    // Go right until key is geq than pivot.
    while (pred(begin[final_left]) && final_left < final_right)
      final_left++;

    // Go left until key is less than pivot.
    while (!pred(begin[final_right]) && final_left < final_right)
      final_right--;

    if (final_left == final_right)
      break;
    std::swap(begin[final_left], begin[final_right]);
    final_left++;
    final_right--;
  }

// All elements on the left side are < piv, all elements on the
@ -298,14 +324,14 @@ namespace __gnu_parallel
      return final_left + 1;
    }
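The return value convention above is easiest to see in a sequential model. A minimal sketch, not part of the patch (partition_model is a hypothetical name): parallel_partition(begin, end, pred, num_threads) behaves observably like std::partition and reports the size of the pred-satisfying prefix, which is final_left + 1 in the code above.

#include <algorithm>
#include <iterator>

// Hypothetical sequential model of parallel_partition's contract:
// reorder so that all elements satisfying pred come first, and
// return the length of that prefix.
template<typename RandomAccessIterator, typename Predicate>
  typename std::iterator_traits<RandomAccessIterator>::difference_type
  partition_model(RandomAccessIterator begin, RandomAccessIterator end,
                  Predicate pred)
  {
    RandomAccessIterator split = std::partition(begin, end, pred);
    return split - begin;
  }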
  /**
    *  @brief Parallel implementation of std::nth_element().
    *  @param begin Begin iterator of input sequence.
    *  @param nth Iterator of element that must be in position afterwards.
    *  @param end End iterator of input sequence.
    *  @param comp Comparator.
    */
  template<typename RandomAccessIterator, typename Comparator>
    void
    parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
                         RandomAccessIterator end, Comparator comp)
@ -324,65 +350,65 @@ namespace __gnu_parallel
      // Break if input range too small.
      while (static_cast<sequence_index_t>(end - begin) >= minimum_length)
        {
          difference_type n = end - begin;

          RandomAccessIterator pivot_pos = begin + rng(n);

          // Swap pivot_pos value to end.
          if (pivot_pos != (end - 1))
            std::swap(*pivot_pos, *(end - 1));
          pivot_pos = end - 1;

          // XXX Comparator must have first_value_type, second_value_type,
          // result_type
          // Comparator == __gnu_parallel::lexicographic<S, int,
          //               __gnu_parallel::less<S, S> >
          // pivot_pos == std::pair<S, int>*
          // XXX binder2nd only for RandomAccessIterators??
          __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
            pred(comp, *pivot_pos);

          // Divide, leave pivot unchanged in last place.
          RandomAccessIterator split_pos1, split_pos2;
          split_pos1 = begin + parallel_partition(begin, end - 1, pred,
                                                  get_max_threads());

          // Left side: < pivot_pos; right side: >= pivot_pos

          // Swap pivot back to middle.
          if (split_pos1 != pivot_pos)
            std::swap(*split_pos1, *pivot_pos);
          pivot_pos = split_pos1;

          // In case all elements are equal, split_pos1 == 0
          if ((split_pos1 + 1 - begin) < (n >> 7)
              || (end - split_pos1) < (n >> 7))
            {
              // Very unequal split, one part smaller than one 128th
              // elements not strictly larger than the pivot.
              __gnu_parallel::unary_negate<__gnu_parallel::
                binder1st<Comparator, value_type, value_type, bool>,
                value_type>
                pred(__gnu_parallel::binder1st<Comparator, value_type,
                     value_type, bool>(comp, *pivot_pos));

              // Find other end of pivot-equal range.
              split_pos2 = __gnu_sequential::partition(split_pos1 + 1,
                                                       end, pred);
            }
          else
            // Only skip the pivot.
            split_pos2 = split_pos1 + 1;

          // Compare iterators.
          if (split_pos2 <= nth)
            begin = split_pos2;
          else if (nth < split_pos1)
            end = split_pos1;
          else
            break;
        }

      // Only at most Settings::partition_minimal_n elements left.
      __gnu_sequential::sort(begin, end, comp);
    }

  /** @brief Parallel implementation of std::partial_sort().
    *  @param begin Begin iterator of input sequence.
    *  @param middle Sort until this position.
    *  @param end End iterator of input sequence.
    *  @param comp Comparator. */
  template<typename RandomAccessIterator, typename Comparator>
    void
    parallel_partial_sort(RandomAccessIterator begin,
                          RandomAccessIterator middle,
                          RandomAccessIterator end, Comparator comp)
    {
@ -390,7 +416,7 @@ namespace __gnu_parallel
      std::sort(begin, middle, comp);
    }

} //namespace __gnu_parallel

#undef _GLIBCXX_VOLATILE
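The recurring fix in this commit is visible above in miniature: once omp_set_dynamic is allowed, the runtime may grant fewer threads than requested, so per-thread state must be sized from omp_get_num_threads() queried inside the parallel region. A standalone sketch of the hazard and the idiom (not from the patch, plain OpenMP):

#include <omp.h>
#include <cstdio>

int main()
{
  omp_set_dynamic(1);            // dynamic teams: num_threads() is only a hint
  int requested = 8, actual = 0;

#pragma omp parallel num_threads(requested)
  {
#pragma omp single
    actual = omp_get_num_threads();  // the team size actually granted
  }

  // Sizing per-thread arrays by 'requested' would leave unused or, worse,
  // never-written entries; 'actual' is the safe bound.
  std::printf("requested %d, got %d\n", requested, actual);
  return 0;
}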
View File include/parallel/quicksort.h
@ -53,11 +53,17 @@
    *  this part.
    */
  template<typename RandomAccessIterator, typename Comparator>
    inline
    typename std::iterator_traits<RandomAccessIterator>::difference_type
    parallel_sort_qs_divide(
        RandomAccessIterator begin,
        RandomAccessIterator end,
        Comparator comp,
        typename std::iterator_traits<RandomAccessIterator>::difference_type
            pivot_rank,
        typename std::iterator_traits<RandomAccessIterator>::difference_type
            num_samples,
        thread_index_t num_threads)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
@ -65,20 +71,24 @@ namespace __gnu_parallel
      difference_type n = end - begin;
      num_samples = std::min(num_samples, n);

      // Allocate uninitialized, to avoid default constructor.
      value_type* samples = static_cast<value_type*>(
        operator new(num_samples * sizeof(value_type)));

      for (difference_type s = 0; s < num_samples; s++)
        {
          const unsigned long long index = static_cast<unsigned long long>(s)
            * n / num_samples;
          new(samples + s) value_type(begin[index]);
        }

      __gnu_sequential::sort(samples, samples + num_samples, comp);

      value_type& pivot = samples[pivot_rank * num_samples / n];

      __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
        pred(comp, pivot);
      difference_type split = parallel_partition(begin, end, pred, num_threads);

      return split;
@ -93,7 +103,10 @@ namespace __gnu_parallel
    */
  template<typename RandomAccessIterator, typename Comparator>
    inline void
    parallel_sort_qs_conquer(RandomAccessIterator begin,
                             RandomAccessIterator end,
                             Comparator comp,
                             thread_index_t num_threads)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
@ -101,8 +114,8 @@ namespace __gnu_parallel
      if (num_threads <= 1)
        {
          __gnu_sequential::sort(begin, end, comp);
          return;
        }

      difference_type n = end - begin, pivot_rank;
@ -110,24 +123,27 @@ namespace __gnu_parallel
      if (n <= 1)
        return;

      thread_index_t num_threads_left;

      if ((num_threads % 2) == 1)
        num_threads_left = num_threads / 2 + 1;
      else
        num_threads_left = num_threads / 2;

      pivot_rank = n * num_threads_left / num_threads;

      difference_type split = parallel_sort_qs_divide(
        begin, end, comp, pivot_rank,
        Settings::sort_qs_num_samples_preset, num_threads);

#pragma omp parallel sections
      {
#pragma omp section
        parallel_sort_qs_conquer(begin, begin + split,
                                 comp, num_threads_left);
#pragma omp section
        parallel_sort_qs_conquer(begin + split, end,
                                 comp, num_threads - num_threads_left);
      }
    }
@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
    */
  template<typename RandomAccessIterator, typename Comparator>
    inline void
    parallel_sort_qs(
        RandomAccessIterator begin,
        RandomAccessIterator end,
        Comparator comp,
        typename std::iterator_traits<RandomAccessIterator>::difference_type n,
        int num_threads)
    {
      _GLIBCXX_CALL(n)
@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads);
      // Hard to avoid.
      omp_set_num_threads(num_threads);

      parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
    }

} //namespace __gnu_parallel

#endif
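The divide step above now takes raw storage from operator new and copy-constructs each sample in place, so value_type no longer needs a default constructor (compare the losertree.h entry in the ChangeLog). The idiom in isolation, as a sketch with hypothetical helper names; cleanup is shown for completeness:

#include <new>

// Copy-construct k samples into uninitialized storage; no default
// constructor of T is ever invoked.
template<typename T>
  T* make_samples(const T* src, int k)
  {
    T* samples = static_cast<T*>(operator new(k * sizeof(T)));
    for (int s = 0; s < k; ++s)
      new(samples + s) T(src[s]);   // placement new: copy construction only
    return samples;
  }

template<typename T>
  void destroy_samples(T* samples, int k)
  {
    for (int s = 0; s < k; ++s)
      samples[s].~T();              // explicit destructor call
    operator delete(samples);
  }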
View File include/parallel/random_shuffle.h
@ -45,16 +45,16 @@
namespace __gnu_parallel
{
  /** @brief Type to hold the index of a bin.
    *
    *  Since many variables of this type are allocated, it should be
    *  chosen as small as possible.
    */
  typedef unsigned short bin_index;

  /** @brief Data known to every thread participating in
      __gnu_parallel::parallel_random_shuffle(). */
  template<typename RandomAccessIterator>
    struct DRandomShufflingGlobalData
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
@ -90,18 +90,15 @@ namespace __gnu_parallel
      : source(_source) { }
    };

  /** @brief Local data for a thread participating in
      __gnu_parallel::parallel_random_shuffle().
    */
  template<typename RandomAccessIterator, typename RandomNumberGenerator>
    struct DRSSorterPU
    {
      /** @brief Number of threads participating in total. */
      int num_threads;

      /** @brief Begin index for bins taken care of by this thread. */
      bin_index bins_begin;
@ -115,29 +112,29 @@ namespace __gnu_parallel
      DRandomShufflingGlobalData<RandomAccessIterator>* sd;
    };

  /** @brief Generate a random number in @c [0,2^logp).
    *  @param logp Logarithm (basis 2) of the upper range bound.
    *  @param rng Random number generator to use.
    */
  template<typename RandomNumberGenerator>
    inline int
    random_number_pow2(int logp, RandomNumberGenerator& rng)
    { return rng.genrand_bits(logp); }

  /** @brief Random shuffle code executed by each thread.
    *  @param pus Array of thread-local data records. */
  template<typename RandomAccessIterator, typename RandomNumberGenerator>
    inline void
    parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
                                   RandomNumberGenerator>* pus)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      thread_index_t iam = omp_get_thread_num();
      DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
      DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;

      // Indexing: dist[bin][processor]
      difference_type length = sd->starts[iam + 1] - sd->starts[iam];
@ -156,35 +153,35 @@ namespace __gnu_parallel
      // First main loop.
      for (difference_type i = 0; i < length; i++)
        {
          bin_index oracle = random_number_pow2(num_bits, rng);
          oracles[i] = oracle;

          // To allow prefix (partial) sum.
          dist[oracle + 1]++;
        }

      for (bin_index b = 0; b < sd->num_bins + 1; b++)
        sd->dist[b][iam + 1] = dist[b];

#     pragma omp barrier

#     pragma omp single
      {
        // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
        // total number of items in bin s
        for (bin_index s = 0; s < sd->num_bins; s++)
          __gnu_sequential::partial_sum(sd->dist[s + 1],
                                        sd->dist[s + 1] + d->num_threads + 1,
                                        sd->dist[s + 1]);
      }

#     pragma omp barrier

      sequence_index_t offset = 0, global_offset = 0;
      for (bin_index s = 0; s < d->bins_begin; s++)
        global_offset += sd->dist[s + 1][d->num_threads];

#     pragma omp barrier

      for (bin_index s = d->bins_begin; s < d->bins_end; s++)
        {
@ -193,9 +190,10 @@ namespace __gnu_parallel
          offset = sd->dist[s + 1][d->num_threads];
        }

      sd->temporaries[iam] = static_cast<value_type*>(
        ::operator new(sizeof(value_type) * offset));

#     pragma omp barrier

      // Draw local copies to avoid false sharing.
      for (bin_index b = 0; b < sd->num_bins + 1; b++)
@ -211,11 +209,11 @@ namespace __gnu_parallel
      // Distribute according to oracles, second main loop.
      for (difference_type i = 0; i < length; i++)
        {
          bin_index target_bin = oracles[i];
          thread_index_t target_p = bin_proc[target_bin];

          // Last column [d->num_threads] stays unchanged.
          temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
        }

      delete[] oracles;
@ -223,23 +221,27 @@ namespace __gnu_parallel
      delete[] bin_proc;
      delete[] temporaries;

#     pragma omp barrier

      // Shuffle bins internally.
      for (bin_index b = d->bins_begin; b < d->bins_end; b++)
        {
          value_type* begin =
            sd->temporaries[iam] +
            ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
            * end =
            sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];

          sequential_random_shuffle(begin, end, rng);
          std::copy(begin, end, sd->source + global_offset +
            ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
        }

      delete[] sd->temporaries[iam];
    }
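The two main loops above implement a counting scatter: draw a random bin ("oracle") per element, build per-bin counts shifted by one so a prefix sum turns them into write offsets, then place each element in its bin. A sequential sketch of that core (hypothetical helper; std::rand stands in for random_number_pow2):

#include <vector>
#include <numeric>
#include <cstdlib>

template<typename T>
  void distribute_by_oracles(const std::vector<T>& in, int num_bins,
                             std::vector<T>& out)
  {
    std::vector<int> oracles(in.size());
    std::vector<int> dist(num_bins + 1, 0);

    for (std::size_t i = 0; i < in.size(); ++i)
      {
        oracles[i] = std::rand() % num_bins;  // stand-in for random_number_pow2
        ++dist[oracles[i] + 1];               // shifted to allow a prefix sum
      }

    std::partial_sum(dist.begin(), dist.end(), dist.begin());

    out.resize(in.size());
    for (std::size_t i = 0; i < in.size(); ++i)
      out[dist[oracles[i]]++] = in[i];        // dist[b]: next free slot of bin b
  }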
  /** @brief Round up to the next greater power of 2.
    *  @param x Integer to round up */
  template<typename T>
    T
    round_up_to_pow2(T x)
    {
@ -249,16 +251,21 @@ namespace __gnu_parallel
      return (T)1 << (log2(x - 1) + 1);
    }

  /** @brief Main parallel random shuffle step.
    *  @param begin Begin iterator of sequence.
    *  @param end End iterator of sequence.
    *  @param n Length of sequence.
    *  @param num_threads Number of threads to use.
    *  @param rng Random number generator to use.
    */
  template<typename RandomAccessIterator, typename RandomNumberGenerator>
    inline void
    parallel_random_shuffle_drs(
        RandomAccessIterator begin,
        RandomAccessIterator end,
        typename std::iterator_traits<RandomAccessIterator>::difference_type n,
        thread_index_t num_threads,
        RandomNumberGenerator& rng)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
@ -275,87 +282,99 @@ namespace __gnu_parallel
      // Try the L1 cache first.

      // Must fit into L1.
      num_bins_cache = std::max<difference_type>(
        1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
      num_bins_cache = round_up_to_pow2(num_bins_cache);

      // No more buckets than TLB entries, power of 2
      // Power of 2 and at least one element per bin, at most the TLB size.
      num_bins = std::min<difference_type>(n, num_bins_cache);

#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
      // 2 TLB entries needed per bin.
      num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
      num_bins = round_up_to_pow2(num_bins);

      if (num_bins < num_bins_cache)
        {
#endif
          // Now try the L2 cache
          // Must fit into L2
          num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
            1, n / (Settings::L2_cache_size / sizeof(value_type))));
          num_bins_cache = round_up_to_pow2(num_bins_cache);

          // No more buckets than TLB entries, power of 2.
          num_bins = static_cast<bin_index>(
            std::min(n, static_cast<difference_type>(num_bins_cache)));
          // Power of 2 and at least one element per bin, at most the TLB size.
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
          // 2 TLB entries needed per bin.
          num_bins = std::min(
            static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
#endif
          num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
        }
#endif

      num_threads = std::min<bin_index>(num_threads, num_bins);

      if (num_threads <= 1)
        return sequential_random_shuffle(begin, end, rng);

      DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);

      DRSSorterPU<RandomAccessIterator, random_number >* pus;
      difference_type* starts;

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          pus = new DRSSorterPU<RandomAccessIterator, random_number>
            [num_threads];

          sd.temporaries = new value_type*[num_threads];
          sd.dist = new difference_type*[num_bins + 1];
          sd.bin_proc = new thread_index_t[num_bins];
          for (bin_index b = 0; b < num_bins + 1; b++)
            sd.dist[b] = new difference_type[num_threads + 1];
          for (bin_index b = 0; b < (num_bins + 1); b++)
            {
              sd.dist[0][0] = 0;
              sd.dist[b][0] = 0;
            }
          starts = sd.starts = new difference_type[num_threads + 1];
          int bin_cursor = 0;
          sd.num_bins = num_bins;
          sd.num_bits = log2(num_bins);

          difference_type chunk_length = n / num_threads,
            split = n % num_threads, start = 0;
          difference_type bin_chunk_length = num_bins / num_threads,
            bin_split = num_bins % num_threads;
          for (thread_index_t i = 0; i < num_threads; i++)
            {
              starts[i] = start;
              start += (i < split) ? (chunk_length + 1) : chunk_length;
              int j = pus[i].bins_begin = bin_cursor;

              // Range of bins for this processor.
              bin_cursor += (i < bin_split) ?
                (bin_chunk_length + 1) : bin_chunk_length;
              pus[i].bins_end = bin_cursor;
              for (; j < bin_cursor; j++)
                sd.bin_proc[j] = i;
              pus[i].num_threads = num_threads;
              pus[i].seed = rng(std::numeric_limits<uint32>::max());
              pus[i].sd = &sd;
            }
          starts[num_threads] = start;
        } //single

        // Now shuffle in parallel.
        parallel_random_shuffle_drs_pu(pus);
      }

      delete[] starts;
      delete[] sd.bin_proc;
@ -367,16 +386,16 @@ namespace __gnu_parallel
      delete[] pus;
    }

  /** @brief Sequential cache-efficient random shuffle.
    *  @param begin Begin iterator of sequence.
    *  @param end End iterator of sequence.
    *  @param rng Random number generator to use.
    */
  template<typename RandomAccessIterator, typename RandomNumberGenerator>
    inline void
    sequential_random_shuffle(RandomAccessIterator begin,
                              RandomAccessIterator end,
                              RandomNumberGenerator& rng)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
@ -388,7 +407,9 @@ namespace __gnu_parallel

#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
      // Try the L1 cache first, must fit into L1.
      num_bins_cache =
        std::max<difference_type>
        (1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
      num_bins_cache = round_up_to_pow2(num_bins_cache);

      // No more buckets than TLB entries, power of 2
@ -403,19 +424,23 @@ namespace __gnu_parallel
      if (num_bins < num_bins_cache)
        {
#endif
          // Now try the L2 cache, must fit into L2.
          num_bins_cache =
            static_cast<bin_index>(std::max<difference_type>(
              1, n / (Settings::L2_cache_size / sizeof(value_type))));
          num_bins_cache = round_up_to_pow2(num_bins_cache);

          // No more buckets than TLB entries, power of 2
          // Power of 2 and at least one element per bin, at most the TLB size.
          num_bins = static_cast<bin_index>
            (std::min(n, static_cast<difference_type>(num_bins_cache)));

#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
          // 2 TLB entries needed per bin
          num_bins =
            std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
          num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
        }
#endif
@ -424,58 +449,62 @@ namespace __gnu_parallel
      if (num_bins > 1)
        {
          value_type* target = static_cast<value_type*>(
            ::operator new(sizeof(value_type) * n));
          bin_index* oracles = new bin_index[n];
          difference_type* dist0 = new difference_type[num_bins + 1],
            * dist1 = new difference_type[num_bins + 1];

          for (int b = 0; b < num_bins + 1; b++)
            dist0[b] = 0;

          random_number bitrng(rng(0xFFFFFFFF));

          for (difference_type i = 0; i < n; i++)
            {
              bin_index oracle = random_number_pow2(num_bits, bitrng);
              oracles[i] = oracle;

              // To allow prefix (partial) sum.
              dist0[oracle + 1]++;
            }

          // Sum up bins.
          __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);

          for (int b = 0; b < num_bins + 1; b++)
            dist1[b] = dist0[b];

          // Distribute according to oracles.
          for (difference_type i = 0; i < n; i++)
            target[(dist0[oracles[i]])++] = *(begin + i);

          for (int b = 0; b < num_bins; b++)
            {
              sequential_random_shuffle(target + dist1[b],
                                        target + dist1[b + 1],
                                        rng);
            }

          delete[] dist0;
          delete[] dist1;
          delete[] oracles;
          delete[] target;
        }
      else
        __gnu_sequential::random_shuffle(begin, end, rng);
    }

  /** @brief Parallel random public call.
    *  @param begin Begin iterator of sequence.
    *  @param end End iterator of sequence.
    *  @param rng Random number generator to use.
    */
  template<typename RandomAccessIterator, typename RandomNumberGenerator>
    inline void
    parallel_random_shuffle(RandomAccessIterator begin,
                            RandomAccessIterator end,
                            RandomNumberGenerator rng = random_number())
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::difference_type difference_type;
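The bin counts above are repeatedly clamped to powers of two via round_up_to_pow2, which this file implements with the log2 helper. Behaviorally, for integral x > 1, it matches the loop below, shown only to make the intent concrete (hypothetical model, not from the patch):

// Behavioral model: smallest power of two not less than x (x > 1 assumed).
template<typename T>
  T round_up_to_pow2_model(T x)
  {
    T p = 1;
    while (p < x)
      p <<= 1;
    return p;
  }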
View File include/parallel/search.h
@ -53,10 +53,10 @@ namespace __gnu_parallel
    *  @param length Length of sequence to search for.
    *  @param advances Returned offsets.
    */
  template<typename RandomAccessIterator, typename _DifferenceTp>
    void
    calc_borders(RandomAccessIterator elements, _DifferenceTp length,
                 _DifferenceTp* off)
    {
      typedef _DifferenceTp difference_type;
@ -66,9 +66,9 @@ namespace __gnu_parallel
      difference_type k = 0;
      for (difference_type j = 2; j <= length; j++)
        {
          while ((k >= 0) && !(elements[k] == elements[j-1]))
            k = off[k];
          off[j] = ++k;
        }
    }

@ -81,11 +81,14 @@ namespace __gnu_parallel
    *  @param end2 End iterator of second sequence.
    *  @param pred Find predicate.
    *  @return Place of finding in first sequence. */
  template<
      typename _RandomAccessIterator1,
      typename _RandomAccessIterator2,
      typename Pred>
    _RandomAccessIterator1
    search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
                    _RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
                    Pred pred)
    {
      typedef std::iterator_traits<_RandomAccessIterator1> traits_type;
      typedef typename traits_type::difference_type difference_type;
@ -103,60 +106,71 @@ namespace __gnu_parallel
      // Where is first occurrence of pattern? defaults to end.
      difference_type result = (end1 - begin1);
      difference_type *splitters;

      // Pattern too long.
      if (input_length < 0)
        return end1;

      omp_lock_t result_lock;
      omp_init_lock(&result_lock);

      thread_index_t num_threads =
        std::max<difference_type>(1,
          std::min<difference_type>(input_length, get_max_threads()));

      difference_type advances[pattern_length];
      calc_borders(begin2, pattern_length, advances);

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          num_threads = omp_get_num_threads();
          splitters = new difference_type[num_threads + 1];
          equally_split(input_length, num_threads, splitters);
        }

        thread_index_t iam = omp_get_thread_num();

        difference_type start = splitters[iam], stop = splitters[iam + 1];

        difference_type pos_in_pattern = 0;
        bool found_pattern = false;

        while (start <= stop && !found_pattern)
          {
            // Get new value of result.
#pragma omp flush(result)
            // No chance for this thread to find first occurrence.
            if (result < start)
              break;

            while (pred(begin1[start + pos_in_pattern],
                        begin2[pos_in_pattern]))
              {
                ++pos_in_pattern;
                if (pos_in_pattern == pattern_length)
                  {
                    // Found new candidate for result.
                    omp_set_lock(&result_lock);
                    result = std::min(result, start);
                    omp_unset_lock(&result_lock);

                    found_pattern = true;
                    break;
                  }
              }

            // Make safe jump.
            start += (pos_in_pattern - advances[pos_in_pattern]);
            pos_in_pattern =
              (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
          }
      } //parallel

      omp_destroy_lock(&result_lock);

      delete[] splitters;

      // Return iterator on found element.
      return (begin1 + result);
    }
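search_template is essentially a parallel Knuth-Morris-Pratt: calc_borders computes the failure offsets, and each thread scans its sub-range using the "safe jump" above. A sequential sketch of the same scan (hypothetical driver; it assumes off has length m + 1 and off[0] == -1, as calc_borders arranges in the part of the file not shown here):

// Sequential KMP scan using offsets as produced by calc_borders.
// Returns the first match position, or n if there is none.
template<typename T>
  long kmp_scan(const T* text, long n, const T* pattern, long m,
                const long* off)
  {
    long start = 0, pos = 0;
    while (start + m <= n)
      {
        while (pos < m && text[start + pos] == pattern[pos])
          ++pos;
        if (pos == m)
          return start;                   // full match
        start += pos - off[pos];          // safe jump, always advances
        pos = (off[pos] < 0) ? 0 : off[pos];
      }
    return n;
  }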
View File include/parallel/set_operations.h
@ -47,28 +47,31 @@
namespace __gnu_parallel
{
  template<typename InputIterator, typename OutputIterator>
    inline OutputIterator
    copy_tail(std::pair<InputIterator, InputIterator> b,
              std::pair<InputIterator, InputIterator> e, OutputIterator r)
    {
      if (b.first != e.first)
        {
          do
            {
              *r++ = *b.first++;
            }
          while (b.first != e.first);
        }
      else
        {
          while (b.second != e.second)
            *r++ = *b.second++;
        }
      return r;
    }

  template<
      typename InputIterator,
      typename OutputIterator,
      typename Comparator>
    struct symmetric_difference_func
    {
      typedef std::iterator_traits<InputIterator> traits_type;
@ -80,55 +83,56 @@ namespace __gnu_parallel
      Comparator comp;

      inline OutputIterator invoke(InputIterator a, InputIterator b,
                                   InputIterator c, InputIterator d,
                                   OutputIterator r) const
      {
        while (a != b && c != d)
          {
            if (comp(*a, *c))
              {
                *r = *a;
                ++a;
                ++r;
              }
            else if (comp(*c, *a))
              {
                *r = *c;
                ++c;
                ++r;
              }
            else
              {
                ++a;
                ++c;
              }
          }
        return std::copy(c, d, std::copy(a, b, r));
      }

      inline difference_type
      count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
        const
      {
        difference_type counter = 0;

        while (a != b && c != d)
          {
            if (comp(*a, *c))
              {
                ++a;
                ++counter;
              }
            else if (comp(*c, *a))
              {
                ++c;
                ++counter;
              }
            else
              {
                ++a;
                ++c;
              }
          }

        return counter + (b - a) + (d - c);
      }
@ -144,7 +148,10 @@ namespace __gnu_parallel
    };

  template<
      typename InputIterator,
      typename OutputIterator,
      typename Comparator>
    struct difference_func
    {
      typedef std::iterator_traits<InputIterator> traits_type;
@ -157,44 +164,45 @@ namespace __gnu_parallel
      inline OutputIterator
      invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
             OutputIterator r) const
      {
        while (a != b && c != d)
          {
            if (comp(*a, *c))
              {
                *r = *a;
                ++a;
                ++r;
              }
            else if (comp(*c, *a))
              { ++c; }
            else
              {
                ++a;
                ++c;
              }
          }
        return std::copy(a, b, r);
      }

      inline difference_type
      count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
        const
      {
        difference_type counter = 0;

        while (a != b && c != d)
          {
            if (comp(*a, *c))
              {
                ++a;
                ++counter;
              }
            else if (comp(*c, *a))
              { ++c; }
            else
              { ++a; ++c; }
          }

        return counter + (b - a);
      }
@ -209,7 +217,10 @@ namespace __gnu_parallel
    };

  template<
      typename InputIterator,
      typename OutputIterator,
      typename Comparator>
    struct intersection_func
    {
      typedef std::iterator_traits<InputIterator> traits_type;
@ -222,44 +233,45 @@ namespace __gnu_parallel
      inline OutputIterator
      invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
             OutputIterator r) const
      {
        while (a != b && c != d)
          {
            if (comp(*a, *c))
              { ++a; }
            else if (comp(*c, *a))
              { ++c; }
            else
              {
                *r = *a;
                ++a;
                ++c;
                ++r;
              }
          }

        return r;
      }

      inline difference_type
      count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
        const
      {
        difference_type counter = 0;

        while (a != b && c != d)
          {
            if (comp(*a, *c))
              { ++a; }
            else if (comp(*c, *a))
              { ++c; }
            else
              {
                ++a;
                ++c;
                ++counter;
              }
          }

        return counter;
      }
@ -273,10 +285,11 @@ namespace __gnu_parallel
      { return out; }
    };

  template<class InputIterator, class OutputIterator, class Comparator>
    struct union_func
    {
      typedef typename std::iterator_traits<InputIterator>::difference_type
        difference_type;

      union_func(Comparator c) : comp(c) {}
@ -284,50 +297,50 @@ namespace __gnu_parallel
      inline OutputIterator
      invoke(InputIterator a, const InputIterator b, InputIterator c,
             const InputIterator d, OutputIterator r) const
      {
        while (a != b && c != d)
          {
            if (comp(*a, *c))
              {
                *r = *a;
                ++a;
              }
            else if (comp(*c, *a))
              {
                *r = *c;
                ++c;
              }
            else
              {
                *r = *a;
                ++a;
                ++c;
              }
            ++r;
          }
        return std::copy(c, d, std::copy(a, b, r));
      }

      inline difference_type
      count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
        const
      {
        difference_type counter = 0;

        while (a != b && c != d)
          {
            if (comp(*a, *c))
              { ++a; }
            else if (comp(*c, *a))
              { ++c; }
            else
              {
                ++a;
                ++c;
              }
            ++counter;
          }

        counter += (b - a);
        counter += (d - c);
@ -343,11 +356,14 @@ namespace __gnu_parallel
      { return std::copy(a, b, out); }
    };

  template<
      typename InputIterator,
      typename OutputIterator,
      typename Operation>
    OutputIterator
    parallel_set_operation(InputIterator begin1, InputIterator end1,
                           InputIterator begin2, InputIterator end2,
                           OutputIterator result, Operation op)
    {
      _GLIBCXX_CALL((end1 - begin1) + (end2 - begin2))
@ -355,7 +371,6 @@ namespace __gnu_parallel
      typedef typename traits_type::difference_type difference_type;
      typedef typename std::pair<InputIterator, InputIterator> iterator_pair;

      if (begin1 == end1)
        return op.first_empty(begin2, end2, result);
@ -364,152 +379,174 @@ namespace __gnu_parallel
      const difference_type size = (end1 - begin1) + (end2 - begin2);

      const iterator_pair sequence[ 2 ] =
        { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
      OutputIterator return_value = result;
      difference_type *borders;
      iterator_pair *block_begins;
      difference_type* lengths;

      thread_index_t num_threads =
        std::min<difference_type>(get_max_threads(),
          std::min(end1 - begin1, end2 - begin2));

#     pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
        {
          num_threads = omp_get_num_threads();

          borders = new difference_type[num_threads + 2];
          equally_split(size, num_threads + 1, borders);
          block_begins = new iterator_pair[num_threads + 1];
          // Very start.
          block_begins[0] = std::make_pair(begin1, begin2);
          lengths = new difference_type[num_threads];
        } //single

        thread_index_t iam = omp_get_thread_num();

        // Result from multiseq_partition.
        InputIterator offset[2];
        const difference_type rank = borders[iam + 1];

        multiseq_partition(sequence, sequence + 2, rank, offset, op.comp);

        // allowed to read?
        // together
        // *(offset[ 0 ] - 1) == *offset[ 1 ]
        if (offset[ 0 ] != begin1 && offset[ 1 ] != end2
            && !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ])
            && !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1)))
          {
            // Avoid split between globally equal elements: move one to
            // front in first sequence.
            --offset[ 0 ];
          }

        iterator_pair block_end = block_begins[ iam + 1 ] =
          iterator_pair(offset[ 0 ], offset[ 1 ]);

        // Make sure all threads have their block_begin result written out.
#       pragma omp barrier

        iterator_pair block_begin = block_begins[ iam ];

        // Begin working for the first block, while the others except
        // the last start to count.
        if (iam == 0)
          {
            // The first thread can copy already.
            lengths[ iam ] = op.invoke(block_begin.first, block_end.first,
                                       block_begin.second, block_end.second,
                                       result)
              - result;
          }
        else
          {
            lengths[ iam ] = op.count(block_begin.first, block_end.first,
                                      block_begin.second, block_end.second);
          }

        // Make sure everyone wrote their lengths.
#       pragma omp barrier

        OutputIterator r = result;

        if (iam == 0)
          {
            // Do the last block.
            for (int i = 0; i < num_threads; ++i)
              r += lengths[i];

            block_begin = block_begins[num_threads];

            // Return the result iterator of the last block.
            return_value = op.invoke(
              block_begin.first, end1, block_begin.second, end2, r);
          }
        else
          {
            for (int i = 0; i < iam; ++i)
              r += lengths[ i ];

            // Reset begins for copy pass.
            op.invoke(block_begin.first, block_end.first,
                      block_begin.second, block_end.second, r);
          }
      }

      return return_value;
    }
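Worth noting in the scheme above: only thread 0 writes output during the first pass; every other thread merely counts, and the copy pass starts once all lengths are known. A miniature of the offset computation that this implies (hypothetical sequential helper, not from the patch):

#include <numeric>
#include <vector>

// Given per-block output lengths, a prefix sum yields each block's write
// offset; block i can then be written independently at result + offsets[i].
std::vector<long> block_offsets(const std::vector<long>& lengths)
{
  std::vector<long> offsets(lengths.size(), 0);
  std::partial_sum(lengths.begin(), lengths.end() - 1, offsets.begin() + 1);
  return offsets;
}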
-  template<typename InputIterator, typename OutputIterator, typename Comparator>
+  template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename Comparator>
   OutputIterator
   parallel_set_union(InputIterator begin1, InputIterator end1,
                      InputIterator begin2, InputIterator end2,
                      OutputIterator result, Comparator comp)
   {
     return parallel_set_operation(begin1, end1, begin2, end2, result,
         union_func< InputIterator, OutputIterator, Comparator>(comp));
   }

-  template<typename InputIterator, typename OutputIterator, typename Comparator>
+  template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename Comparator>
   OutputIterator
   parallel_set_intersection(InputIterator begin1, InputIterator end1,
                             InputIterator begin2, InputIterator end2,
                             OutputIterator result, Comparator comp)
   {
     return parallel_set_operation(begin1, end1, begin2, end2, result,
         intersection_func<InputIterator, OutputIterator, Comparator>(comp));
   }

   template<typename InputIterator, typename OutputIterator>
   OutputIterator
-  set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result)
+  set_intersection(InputIterator begin1, InputIterator end1,
+                   InputIterator begin2, InputIterator end2,
+                   OutputIterator result)
   {
     typedef std::iterator_traits<InputIterator> traits_type;
     typedef typename traits_type::value_type value_type;

     return set_intersection(begin1, end1, begin2, end2, result,
                             std::less<value_type>());
   }

-  template<typename InputIterator, typename OutputIterator, typename Comparator>
+  template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename Comparator>
   OutputIterator
   parallel_set_difference(InputIterator begin1, InputIterator end1,
                           InputIterator begin2, InputIterator end2,
                           OutputIterator result, Comparator comp)
   {
     return parallel_set_operation(begin1, end1, begin2, end2, result,
         difference_func<InputIterator, OutputIterator, Comparator>(comp));
   }

-  template<typename InputIterator, typename OutputIterator, typename Comparator>
+  template<
+    typename InputIterator,
+    typename OutputIterator,
+    typename Comparator>
   OutputIterator
-  parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp)
+  parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1,
+                                    InputIterator begin2, InputIterator end2,
+                                    OutputIterator result, Comparator comp)
   {
     return parallel_set_operation(begin1, end1, begin2, end2, result,
-        symmetric_difference_func<InputIterator, OutputIterator, Comparator>(comp));
+        symmetric_difference_func<InputIterator, OutputIterator, Comparator>
+          (comp));
   }
 }

 #endif // _GLIBCXX_SET_ALGORITHM_
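These entry points are normally reached through parallel-mode dispatch rather than called directly: compiled with -fopenmp and -D_GLIBCXX_PARALLEL, a plain std::set_union on large inputs may be routed to parallel_set_union. A small usage sketch; the dispatch threshold is runtime-dependent, so parallel execution is possible but not guaranteed.

    // Compile with: g++ -fopenmp -D_GLIBCXX_PARALLEL example.cc
    #include <algorithm>
    #include <functional>
    #include <vector>

    int main()
    {
      std::vector<int> a, b, u;
      for (int i = 0; i < 1000000; ++i)
        {
          a.push_back(2 * i);   // sorted evens
          b.push_back(3 * i);   // sorted multiples of three
        }
      u.resize(a.size() + b.size());

      // With parallel mode enabled this call may be dispatched to
      // __gnu_parallel::parallel_set_union for inputs this large.
      std::vector<int>::iterator u_end =
        std::set_union(a.begin(), a.end(), b.begin(), b.end(),
                       u.begin(), std::less<int>());
      u.resize(u_end - u.begin());
      return 0;
    }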
include/parallel/unique_copy.h

@@ -44,16 +44,19 @@
 namespace __gnu_parallel
 {
-  /** @brief Parallel std::unique_copy(), without explicit equality predicate.
+  /** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
    * @param first Begin iterator of input sequence.
    * @param last End iterator of input sequence.
    * @param result Begin iterator of result sequence.
    * @param binary_pred Equality predicate.
    * @return End iterator of result sequence. */
-  template<typename InputIterator, class OutputIterator, class BinaryPredicate>
+  template<
+    typename InputIterator,
+    class OutputIterator,
+    class BinaryPredicate>
   inline OutputIterator
   parallel_unique_copy(InputIterator first, InputIterator last,
                        OutputIterator result, BinaryPredicate binary_pred)
   {
     _GLIBCXX_CALL(last - first)

@@ -62,126 +65,136 @@ namespace __gnu_parallel
     typedef typename traits_type::difference_type difference_type;

     difference_type size = last - first;
-    int num_threads = __gnu_parallel::get_max_threads();
-    difference_type counter[num_threads + 1];

     if (size == 0)
       return result;

     // Let the first thread process two parts.
-    difference_type borders[num_threads + 2];
-    __gnu_parallel::equally_split(size, num_threads + 1, borders);
+    difference_type *counter;
+    difference_type *borders;
+
+    thread_index_t num_threads = get_max_threads();

     // First part contains at least one element.
-#pragma omp parallel num_threads(num_threads)
+#   pragma omp parallel num_threads(num_threads)
     {
-      int iam = omp_get_thread_num();
+#     pragma omp single
+      {
+        num_threads = omp_get_num_threads();
+        borders = new difference_type[num_threads + 2];
+        equally_split(size, num_threads + 1, borders);
+        counter = new difference_type[num_threads + 1];
+      }

+      thread_index_t iam = omp_get_thread_num();
       difference_type begin, end;

       // Check for length without duplicates
       // Needed for position in output
       difference_type i = 0;
       OutputIterator out = result;

       if (iam == 0)
         {
           begin = borders[0] + 1; // == 1
           end = borders[iam + 1];

           i++;
           new (static_cast<void *>(&*out)) value_type(*first);
           out++;

           for (InputIterator iter = first + begin; iter < first + end; ++iter)
             {
               if (!binary_pred(*iter, *(iter-1)))
                 {
                   i++;
                   new (static_cast<void *>(&*out)) value_type(*iter);
                   out++;
                 }
             }
         }
       else
         {
           begin = borders[iam]; //one part
           end = borders[iam + 1];

           for (InputIterator iter = first + begin; iter < first + end; ++iter)
             {
               if (!binary_pred(*iter, *(iter-1)))
                 {
                   i++;
                 }
             }
         }
       counter[iam] = i;

       // Last part still untouched.
       difference_type begin_output;

-#pragma omp barrier
+#     pragma omp barrier

       // Store result in output on calculated positions.
       begin_output = 0;

       if (iam == 0)
         {
           for (int t = 0; t < num_threads; t++)
             begin_output += counter[t];

           i = 0;
           OutputIterator iter_out = result + begin_output;

           begin = borders[num_threads];
           end = size;

           for (InputIterator iter = first + begin; iter < first + end; ++iter)
             {
               if (iter == first || !binary_pred(*iter, *(iter-1)))
                 {
                   i++;
                   new (static_cast<void *>(&*iter_out)) value_type(*iter);
                   iter_out++;
                 }
             }

           counter[num_threads] = i;
         }
       else
         {
           for (int t = 0; t < iam; t++)
             begin_output += counter[t];

           OutputIterator iter_out = result + begin_output;
           for (InputIterator iter = first + begin; iter < first + end; ++iter)
             {
               if (!binary_pred(*iter, *(iter-1)))
                 {
                   new (static_cast<void *> (&*iter_out)) value_type(*iter);
                   iter_out++;
                 }
             }
         }
     }

     difference_type end_output = 0;
     for (int t = 0; t < num_threads + 1; t++)
       end_output += counter[t];

+    delete[] borders;
+
     return result + end_output;
   }
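The restructuring above is the core of the omp_dynamic fix. With omp_set_dynamic(true) the runtime may grant fewer threads than num_threads(...) requests, so any thread count read before the parallel region is only an upper bound. The rewritten code therefore lets exactly one thread inside the region read the team size actually granted and allocate the shared arrays; the implicit barrier at the end of omp single publishes both to the whole team. A minimal sketch of the idiom, detached from this file:

    #include <omp.h>

    void dynamic_safe(int requested)
    {
      int num_threads = requested;   // upper bound only
      int *shared;

    #pragma omp parallel num_threads(num_threads)
      {
    #pragma omp single
        {
          // Team size actually granted; may be smaller than requested
          // when omp_set_dynamic(true) is in effect.
          num_threads = omp_get_num_threads();
          shared = new int[num_threads];
        }   // implicit barrier: all threads now see the allocation

        int iam = omp_get_thread_num();
        shared[iam] = iam;           // safe: iam < num_threads
      }

      delete[] shared;
    }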
   /** @brief Parallel std::unique_copy(), without explicit equality predicate
    * @param first Begin iterator of input sequence.
    * @param last End iterator of input sequence.
    * @param result Begin iterator of result sequence.
    * @return End iterator of result sequence. */
   template<typename InputIterator, class OutputIterator>
   inline OutputIterator
   parallel_unique_copy(InputIterator first, InputIterator last,
                        OutputIterator result)
   {
     typedef typename std::iterator_traits<InputIterator>::value_type value_type;

include/parallel/workstealing.h

@@ -55,8 +55,8 @@ namespace __gnu_parallel
 #define _GLIBCXX_JOB_VOLATILE volatile

   /** @brief One job for a certain thread. */
   template<typename _DifferenceTp>
   struct Job
   {
     typedef _DifferenceTp difference_type;
@@ -78,31 +78,38 @@ namespace __gnu_parallel
     _GLIBCXX_JOB_VOLATILE difference_type load;
   };
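Because first, last, and load are read by stealing threads without any lock, a snapshot of the three fields is only trusted when the invariant first + load - 1 == last holds; the do-while loop in the stealing code below spins until it observes such a consistent snapshot. A simplified illustration with a plain-integer job, names hypothetical:

    // Racy by design: mirrors the consistency check a stealer applies
    // to a victim's job descriptor before committing to a steal.
    struct SimpleJob
    {
      volatile long first;   // next element to process
      volatile long last;    // last element of the job
      volatile long load;    // cached last - first + 1
    };

    inline bool snapshot_consistent(const SimpleJob& job)
    {
      long f = job.first;
      long l = job.last;
      long ld = job.load;
      return ld > 0 && (f + ld - 1) == l;
    }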
   /** @brief Work stealing algorithm for random access iterators.
    *
    * Uses O(1) additional memory. Synchronization at job lists is
    * done with atomic operations.
    * @param begin Begin iterator of element sequence.
    * @param end End iterator of element sequence.
    * @param op User-supplied functor (comparator, predicate, adding
    * functor, ...).
    * @param f Functor to "process" an element with op (depends on
    * desired functionality, e. g. for std::for_each(), ...).
    * @param r Functor to "add" a single result to the already
    * processed elements (depends on functionality).
    * @param base Base value for reduction.
    * @param output Pointer to position where final result is written to
    * @param bound Maximum number of elements processed (e. g. for
    * std::count_n()).
    * @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+  template<
+    typename RandomAccessIterator,
+    typename Op,
+    typename Fu,
+    typename Red,
+    typename Result>
   Op
-  for_each_template_random_access_workstealing(RandomAccessIterator begin,
-                                               RandomAccessIterator end,
-                                               Op op, Fu& f, Red r,
-                                               Result base, Result& output,
-                                               typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+  for_each_template_random_access_workstealing(
+    RandomAccessIterator begin,
+    RandomAccessIterator end,
+    Op op, Fu& f, Red r,
+    Result base, Result& output,
+    typename std::iterator_traits<RandomAccessIterator>::difference_type
+      bound)
   {
     _GLIBCXX_CALL(end - begin)

@@ -110,182 +117,187 @@ namespace __gnu_parallel
     typedef typename traits_type::difference_type difference_type;

-    difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
+    difference_type chunk_size =
+      static_cast<difference_type>(Settings::workstealing_chunk_size);

     // How many jobs?
     difference_type length = (bound < 0) ? (end - begin) : bound;

     // To avoid false sharing in a cache line.
-    const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
+    const int stride =
+      Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;

     // Total number of threads currently working.
     thread_index_t busy = 0;
-    thread_index_t num_threads = get_max_threads();
-    difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
+
+    Job<difference_type> *job;

     omp_lock_t output_lock;
     omp_init_lock(&output_lock);

-    // No more threads than jobs, at least one thread.
-    difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
-    num_threads = static_cast<thread_index_t>(num_threads_max);
-
-    // Create job description array.
-    Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
-
     // Write base value to output.
     output = base;

-#pragma omp parallel shared(busy) num_threads(num_threads)
+    // No more threads than jobs, at least one thread.
+    thread_index_t num_threads =
+      __gnu_parallel::max<thread_index_t>(1,
+        __gnu_parallel::min<difference_type>(length, get_max_threads()));
+
+#   pragma omp parallel shared(busy) num_threads(num_threads)
     {
-      // Initialization phase.
-
+#     pragma omp single
+      {
+        num_threads = omp_get_num_threads();
+        // Create job description array.
+        job = new Job<difference_type>[num_threads * stride];
+      }
+
+      // Initialization phase.
+
       // Flags for every thread if it is doing productive work.
       bool iam_working = false;

       // Thread id.
       thread_index_t iam = omp_get_thread_num();

       // This job.
       Job<difference_type>& my_job = job[iam * stride];

       // Random number (for work stealing).
       thread_index_t victim;

       // Local value for reduction.
       Result result = Result();

       // Number of elements to steal in one attempt.
       difference_type steal;

-      // Every thread has its own random number generator (modulo num_threads).
+      // Every thread has its own random number generator
+      // (modulo num_threads).
       random_number rand_gen(iam, num_threads);

-#pragma omp atomic
       // This thread is currently working.
+#     pragma omp atomic
       busy++;

       iam_working = true;

       // How many jobs per thread? last thread gets the rest.
-      my_job.first = static_cast<difference_type>(iam * (length / num_threads));
-      my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
+      my_job.first =
+        static_cast<difference_type>(iam * (length / num_threads));
+      my_job.last = (iam == (num_threads - 1)) ?
+        (length - 1) : ((iam + 1) * (length / num_threads) - 1);
       my_job.load = my_job.last - my_job.first + 1;

       // Init result with first value (to have a base value for reduction).
       if (my_job.first <= my_job.last)
         {
           // Cannot use volatile variable directly.
           difference_type my_first = my_job.first;
           result = f(op, begin + my_first);
           my_job.first++;
           my_job.load--;
         }

       RandomAccessIterator current;

-#pragma omp barrier
+#     pragma omp barrier

       // Actual work phase
       // Work on own or stolen start
       while (busy > 0)
         {
           // Work until no productive thread left.
-#pragma omp flush(busy)
+#         pragma omp flush(busy)

           // Thread has own work to do
           while (my_job.first <= my_job.last)
             {
               // fetch-and-add call
               // Reserve current job block (size chunk_size) in my queue.
-              difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
+              difference_type current_job =
+                fetch_and_add<difference_type>(&(my_job.first), chunk_size);

               // Update load, to make the three values consistent,
               // first might have been changed in the meantime
               my_job.load = my_job.last - my_job.first + 1;
-              for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
+              for (difference_type job_counter = 0;
+                   job_counter < chunk_size && current_job <= my_job.last;
+                   job_counter++)
                 {
                   // Yes: process it!
                   current = begin + current_job;
                   current_job++;

                   // Do actual work.
                   result = r(result, f(op, current));
                 }

-#pragma omp flush(busy)
+#             pragma omp flush(busy)
             }

           // After reaching this point, a thread's job list is empty.
           if (iam_working)
             {
               // This thread no longer has work.
-#pragma omp atomic
+#             pragma omp atomic
               busy--;

               iam_working = false;
             }

           difference_type supposed_first, supposed_last, supposed_load;
           do
             {
-              // Find random nonempty deque (not own) and do consistency check.
+              // Find random nonempty deque (not own), do consistency check.
               yield();
-#pragma omp flush(busy)
+#             pragma omp flush(busy)
               victim = rand_gen();
               supposed_first = job[victim * stride].first;
               supposed_last = job[victim * stride].last;
               supposed_load = job[victim * stride].load;
             }
           while (busy > 0
-                 && ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
+                 && ((supposed_load <= 0)
+                     || ((supposed_first + supposed_load - 1) != supposed_last)));

           if (busy == 0)
             break;

           if (supposed_load > 0)
             {
               // Has work and work to do.
               // Number of elements to steal (at least one).
               steal = (supposed_load < 2) ? 1 : supposed_load / 2;

-              // Protects against stealing threads
-              // omp_set_lock(&(job[victim * stride].lock));
-
               // Push victim's start forward.
-              difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
-              difference_type stolen_try = stolen_first + steal - difference_type(1);
-
-              // Protects against working thread
-              // omp_unset_lock(&(job[victim * stride].lock));
+              difference_type stolen_first =
+                fetch_and_add<difference_type>(
+                  &(job[victim * stride].first), steal);
+              difference_type stolen_try =
+                stolen_first + steal - difference_type(1);

               my_job.first = stolen_first;
-
-              // Avoid std::min dependencies.
-              my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
-
+              my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
               my_job.load = my_job.last - my_job.first + 1;

-              //omp_unset_lock(&(my_job.lock));
-
-#pragma omp atomic
               // Has potential work again.
+#             pragma omp atomic
               busy++;
               iam_working = true;

-#pragma omp flush(busy)
+#             pragma omp flush(busy)
             }
-#pragma omp flush(busy)
+#         pragma omp flush(busy)
         } // end while busy > 0

       // Add accumulated result to output.
       omp_set_lock(&output_lock);
       output = r(output, result);
       omp_unset_lock(&output_lock);
-
-      //omp_destroy_lock(&(my_job.lock));
     }

     delete[] job;
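The steal itself needs no lock: a single fetch_and_add on the victim's first field simultaneously reserves the stolen range for the thief and moves the victim past it. A compacted sketch of that step, using std::atomic as a modern stand-in for __gnu_parallel::fetch_and_add (which likewise returns the old value); names are illustrative only:

    #include <atomic>

    struct AtomicJob
    {
      std::atomic<long> first;   // next element to process
      long last;                 // last element of the job
    };

    // Atomically claim up to half of the victim's remaining work.
    // Returns true and fills [begin, end] on success; the claim can
    // come up empty if the victim drained its queue concurrently.
    bool steal_half(AtomicJob& victim, long load, long& begin, long& end)
    {
      long steal = (load < 2) ? 1 : load / 2;          // at least one
      long stolen_first = victim.first.fetch_add(steal);
      long stolen_try = stolen_first + steal - 1;

      begin = stolen_first;
      end = (stolen_try < victim.last) ? stolen_try : victim.last;
      return begin <= end;
    }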