mirror of git://gcc.gnu.org/git/gcc.git
re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))
2007-11-22 Johannes Singler <singler@ira.uka.de>
PR libstdc++/33893
* include/parallel/multiway_merge.h: made omp_dynamic-safe
* include/parallel/workstealing.h: made omp_dynamic-safe
* include/parallel/base.h: infrastructure, cleanup
* include/parallel/par_loop.h: made omp_dynamic-safe
* include/parallel/features.h: activate loser tree variant
* include/parallel/quicksort.h: made omp_dynamic-safe
* include/parallel/compiletime_settings.h: settings overridable
* include/parallel/equally_split.h: made omp_dynamic-safe
* include/parallel/omp_loop_static.h: made omp_dynamic-safe
* include/parallel/random_shuffle.h: made omp_dynamic-safe
* include/parallel/balanced_quicksort.h: made omp_dynamic-safe
* include/parallel/set_operations.h: made omp_dynamic-safe
* include/parallel/unique_copy.h: made omp_dynamic-safe
* include/parallel/multiway_mergesort.h: made omp_dynamic-safe
* include/parallel/search.h: made omp_dynamic-safe
* include/parallel/partition.h: made omp_dynamic-safe
* include/parallel/partial_sum.h: made omp_dynamic-safe
* include/parallel/find.h: made omp_dynamic-safe
* include/parallel/omp_loop.h: made omp_dynamic-safe
* include/parallel/losertree.h: avoid default constructor
From-SVN: r130347
This commit is contained in:
parent
7861a5ce14
commit
e683ee2a20
|
|
@ -1,3 +1,27 @@
|
||||||
|
2007-11-22 Johannes Singler <singler@ira.uka.de>
|
||||||
|
|
||||||
|
PR libstdc++/33893
|
||||||
|
* include/parallel/multiway_merge.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/workstealing.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/base.h: infrastructure, cleanup
|
||||||
|
* include/parallel/par_loop.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/features.h: activate loser tree variant
|
||||||
|
* include/parallel/quicksort.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/compiletime_settings.h: settings overridable
|
||||||
|
* include/parallel/equally_split.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/omp_loop_static.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/random_shuffle.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/balanced_quicksort.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/set_operations.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/unique_copy.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/multiway_mergesort.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/search.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/partition.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/partial_sum.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/find.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/omp_loop.h: made omp_dynamic-safe
|
||||||
|
* include/parallel/losertree.h: avoid default constructor
|
||||||
|
|
||||||
2007-11-21 Jonathan Wakely <jwakely.gcc@gmail.com>
|
2007-11-21 Jonathan Wakely <jwakely.gcc@gmail.com>
|
||||||
|
|
||||||
* docs/html/17_intro/C++STYLE: Fix typos.
|
* docs/html/17_intro/C++STYLE: Fix typos.
|
||||||
|
|
|
||||||
|
|
@ -63,15 +63,15 @@
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
/** @brief Information local to one thread in the parallel quicksort run. */
|
/** @brief Information local to one thread in the parallel quicksort run. */
|
||||||
template<typename RandomAccessIterator>
|
template<typename RandomAccessIterator>
|
||||||
struct QSBThreadLocal
|
struct QSBThreadLocal
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
/** @brief Continuous part of the sequence, described by an
|
/** @brief Continuous part of the sequence, described by an
|
||||||
iterator pair. */
|
iterator pair. */
|
||||||
typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
|
typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
|
||||||
|
|
||||||
/** @brief Initial piece to work on. */
|
/** @brief Initial piece to work on. */
|
||||||
|
|
@ -94,29 +94,17 @@ namespace __gnu_parallel
|
||||||
QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
|
QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Initialize the thread local storage.
|
/** @brief Balanced quicksort divide step.
|
||||||
* @param tls Array of thread-local storages.
|
* @param begin Begin iterator of subsequence.
|
||||||
* @param queue_size Size of the work-stealing queue. */
|
* @param end End iterator of subsequence.
|
||||||
template<typename RandomAccessIterator>
|
* @param comp Comparator.
|
||||||
inline void
|
* @param num_threads Number of threads that are allowed to work on
|
||||||
qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size)
|
* this part.
|
||||||
{
|
* @pre @c (end-begin)>=1 */
|
||||||
int iam = omp_get_thread_num();
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** @brief Balanced quicksort divide step.
|
|
||||||
* @param begin Begin iterator of subsequence.
|
|
||||||
* @param end End iterator of subsequence.
|
|
||||||
* @param comp Comparator.
|
|
||||||
* @param num_threads Number of threads that are allowed to work on
|
|
||||||
* this part.
|
|
||||||
* @pre @c (end-begin)>=1 */
|
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
|
||||||
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
|
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
|
qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
|
||||||
Comparator comp, int num_threads)
|
Comparator comp, thread_index_t num_threads)
|
||||||
{
|
{
|
||||||
_GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
|
_GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
|
||||||
|
|
||||||
|
|
@ -124,18 +112,20 @@ namespace __gnu_parallel
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp);
|
RandomAccessIterator pivot_pos = median_of_three_iterators(
|
||||||
|
begin, begin + (end - begin) / 2, end - 1, comp);
|
||||||
|
|
||||||
#if defined(_GLIBCXX_ASSERTIONS)
|
#if defined(_GLIBCXX_ASSERTIONS)
|
||||||
// Must be in between somewhere.
|
// Must be in between somewhere.
|
||||||
difference_type n = end - begin;
|
difference_type n = end - begin;
|
||||||
|
|
||||||
_GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
|
_GLIBCXX_PARALLEL_ASSERT(
|
||||||
|| (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
|
(!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
|
||||||
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
|
|| (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
|
||||||
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
|
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
|
||||||
|| (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
|
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
|
||||||
|| (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
|
|| (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
|
||||||
|
|| (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Swap pivot value to end.
|
// Swap pivot value to end.
|
||||||
|
|
@ -143,10 +133,12 @@ namespace __gnu_parallel
|
||||||
std::swap(*pivot_pos, *(end - 1));
|
std::swap(*pivot_pos, *(end - 1));
|
||||||
pivot_pos = end - 1;
|
pivot_pos = end - 1;
|
||||||
|
|
||||||
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
|
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
|
||||||
|
pred(comp, *pivot_pos);
|
||||||
|
|
||||||
// Divide, returning end - begin - 1 in the worst case.
|
// Divide, returning end - begin - 1 in the worst case.
|
||||||
difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads);
|
difference_type split_pos = parallel_partition(
|
||||||
|
begin, end - 1, pred, num_threads);
|
||||||
|
|
||||||
// Swap back pivot to middle.
|
// Swap back pivot to middle.
|
||||||
std::swap(*(begin + split_pos), *pivot_pos);
|
std::swap(*(begin + split_pos), *pivot_pos);
|
||||||
|
|
@ -163,18 +155,21 @@ namespace __gnu_parallel
|
||||||
return split_pos;
|
return split_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Quicksort conquer step.
|
/** @brief Quicksort conquer step.
|
||||||
* @param tls Array of thread-local storages.
|
* @param tls Array of thread-local storages.
|
||||||
* @param begin Begin iterator of subsequence.
|
* @param begin Begin iterator of subsequence.
|
||||||
* @param end End iterator of subsequence.
|
* @param end End iterator of subsequence.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
* @param iam Number of the thread processing this function.
|
* @param iam Number of the thread processing this function.
|
||||||
* @param num_threads Number of threads that are allowed to work on this part. */
|
* @param num_threads
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
* Number of threads that are allowed to work on this part. */
|
||||||
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
|
qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
|
||||||
RandomAccessIterator begin, RandomAccessIterator end,
|
RandomAccessIterator begin, RandomAccessIterator end,
|
||||||
Comparator comp, thread_index_t iam, thread_index_t num_threads)
|
Comparator comp,
|
||||||
|
thread_index_t iam, thread_index_t num_threads,
|
||||||
|
bool parent_wait)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -182,14 +177,14 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
difference_type n = end - begin;
|
difference_type n = end - begin;
|
||||||
|
|
||||||
if (num_threads <= 1 || n < 2)
|
if (num_threads <= 1 || n <= 1)
|
||||||
{
|
{
|
||||||
tls[iam]->initial.first = begin;
|
tls[iam]->initial.first = begin;
|
||||||
tls[iam]->initial.second = end;
|
tls[iam]->initial.second = end;
|
||||||
|
|
||||||
qsb_local_sort_with_helping(tls, comp, iam);
|
qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Divide step.
|
// Divide step.
|
||||||
|
|
@ -199,33 +194,55 @@ namespace __gnu_parallel
|
||||||
_GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
|
_GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n));
|
thread_index_t num_threads_leftside =
|
||||||
|
std::max<thread_index_t>(1, std::min<thread_index_t>(
|
||||||
|
num_threads - 1, split_pos * num_threads / n));
|
||||||
|
|
||||||
#pragma omp atomic
|
# pragma omp atomic
|
||||||
*tls[iam]->elements_leftover -= (difference_type)1;
|
*tls[iam]->elements_leftover -= (difference_type)1;
|
||||||
|
|
||||||
// Conquer step.
|
// Conquer step.
|
||||||
#pragma omp parallel sections num_threads(2)
|
# pragma omp parallel num_threads(2)
|
||||||
{
|
{
|
||||||
#pragma omp section
|
bool wait;
|
||||||
qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside);
|
if(omp_get_num_threads() < 2)
|
||||||
// The pivot_pos is left in place, to ensure termination.
|
wait = false;
|
||||||
#pragma omp section
|
else
|
||||||
qsb_conquer(tls, begin + split_pos + 1, end, comp,
|
wait = parent_wait;
|
||||||
iam + num_threads_leftside, num_threads - num_threads_leftside);
|
|
||||||
|
# pragma omp sections
|
||||||
|
{
|
||||||
|
# pragma omp section
|
||||||
|
{
|
||||||
|
qsb_conquer(tls, begin, begin + split_pos, comp,
|
||||||
|
iam,
|
||||||
|
num_threads_leftside,
|
||||||
|
wait);
|
||||||
|
wait = parent_wait;
|
||||||
|
}
|
||||||
|
// The pivot_pos is left in place, to ensure termination.
|
||||||
|
# pragma omp section
|
||||||
|
{
|
||||||
|
qsb_conquer(tls, begin + split_pos + 1, end, comp,
|
||||||
|
iam + num_threads_leftside,
|
||||||
|
num_threads - num_threads_leftside,
|
||||||
|
wait);
|
||||||
|
wait = parent_wait;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Quicksort step doing load-balanced local sort.
|
* @brief Quicksort step doing load-balanced local sort.
|
||||||
* @param tls Array of thread-local storages.
|
* @param tls Array of thread-local storages.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
* @param iam Number of the thread processing this function.
|
* @param iam Number of the thread processing this function.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
|
qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
|
||||||
Comparator& comp, int iam)
|
Comparator& comp, int iam, bool wait)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -251,151 +268,162 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
// Invariant: current must be a valid (maybe empty) range.
|
// Invariant: current must be a valid (maybe empty) range.
|
||||||
RandomAccessIterator begin = current.first, end = current.second;
|
RandomAccessIterator begin = current.first, end = current.second;
|
||||||
difference_type n = end - begin;
|
difference_type n = end - begin;
|
||||||
|
|
||||||
if (n > base_case_n)
|
if (n > base_case_n)
|
||||||
{
|
{
|
||||||
// Divide.
|
// Divide.
|
||||||
RandomAccessIterator pivot_pos = begin + rng(n);
|
RandomAccessIterator pivot_pos = begin + rng(n);
|
||||||
|
|
||||||
// Swap pivot_pos value to end.
|
// Swap pivot_pos value to end.
|
||||||
if (pivot_pos != (end - 1))
|
if (pivot_pos != (end - 1))
|
||||||
std::swap(*pivot_pos, *(end - 1));
|
std::swap(*pivot_pos, *(end - 1));
|
||||||
pivot_pos = end - 1;
|
pivot_pos = end - 1;
|
||||||
|
|
||||||
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
|
__gnu_parallel::binder2nd
|
||||||
|
<Comparator, value_type, value_type, bool>
|
||||||
|
pred(comp, *pivot_pos);
|
||||||
|
|
||||||
// Divide, leave pivot unchanged in last place.
|
// Divide, leave pivot unchanged in last place.
|
||||||
RandomAccessIterator split_pos1, split_pos2;
|
RandomAccessIterator split_pos1, split_pos2;
|
||||||
split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
|
split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
|
||||||
|
|
||||||
// Left side: < pivot_pos; right side: >= pivot_pos.
|
// Left side: < pivot_pos; right side: >= pivot_pos.
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
_GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
|
_GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
|
||||||
#endif
|
#endif
|
||||||
// Swap pivot back to middle.
|
// Swap pivot back to middle.
|
||||||
if (split_pos1 != pivot_pos)
|
if (split_pos1 != pivot_pos)
|
||||||
std::swap(*split_pos1, *pivot_pos);
|
std::swap(*split_pos1, *pivot_pos);
|
||||||
pivot_pos = split_pos1;
|
pivot_pos = split_pos1;
|
||||||
|
|
||||||
// In case all elements are equal, split_pos1 == 0.
|
// In case all elements are equal, split_pos1 == 0.
|
||||||
if ((split_pos1 + 1 - begin) < (n >> 7)
|
if ((split_pos1 + 1 - begin) < (n >> 7)
|
||||||
|| (end - split_pos1) < (n >> 7))
|
|| (end - split_pos1) < (n >> 7))
|
||||||
{
|
{
|
||||||
// Very unequal split, one part smaller than one 128th
|
// Very unequal split, one part smaller than one 128th
|
||||||
// elements not strictly larger than the pivot.
|
// elements not strictly larger than the pivot.
|
||||||
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
|
__gnu_parallel::unary_negate<__gnu_parallel::binder1st
|
||||||
|
<Comparator, value_type, value_type, bool>, value_type>
|
||||||
|
pred(__gnu_parallel::binder1st
|
||||||
|
<Comparator, value_type, value_type, bool>(
|
||||||
|
comp, *pivot_pos));
|
||||||
|
|
||||||
// Find other end of pivot-equal range.
|
// Find other end of pivot-equal range.
|
||||||
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
|
split_pos2 = __gnu_sequential::partition(
|
||||||
}
|
split_pos1 + 1, end, pred);
|
||||||
else
|
}
|
||||||
{
|
else
|
||||||
// Only skip the pivot.
|
// Only skip the pivot.
|
||||||
split_pos2 = split_pos1 + 1;
|
split_pos2 = split_pos1 + 1;
|
||||||
}
|
|
||||||
|
|
||||||
// Elements equal to pivot are done.
|
// Elements equal to pivot are done.
|
||||||
elements_done += (split_pos2 - split_pos1);
|
elements_done += (split_pos2 - split_pos1);
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
total_elements_done += (split_pos2 - split_pos1);
|
total_elements_done += (split_pos2 - split_pos1);
|
||||||
#endif
|
#endif
|
||||||
// Always push larger part onto stack.
|
// Always push larger part onto stack.
|
||||||
if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
|
if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
|
||||||
{
|
{
|
||||||
// Right side larger.
|
// Right side larger.
|
||||||
if ((split_pos2) != end)
|
if ((split_pos2) != end)
|
||||||
tl.leftover_parts.push_front(std::make_pair(split_pos2, end));
|
tl.leftover_parts.push_front(std::make_pair(split_pos2, end));
|
||||||
|
|
||||||
//current.first = begin; //already set anyway
|
//current.first = begin; //already set anyway
|
||||||
current.second = split_pos1;
|
current.second = split_pos1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Left side larger.
|
// Left side larger.
|
||||||
if (begin != split_pos1)
|
if (begin != split_pos1)
|
||||||
tl.leftover_parts.push_front(std::make_pair(begin, split_pos1));
|
tl.leftover_parts.push_front(
|
||||||
|
std::make_pair(begin, split_pos1));
|
||||||
|
|
||||||
current.first = split_pos2;
|
current.first = split_pos2;
|
||||||
//current.second = end; //already set anyway
|
//current.second = end; //already set anyway
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
__gnu_sequential::sort(begin, end, comp);
|
__gnu_sequential::sort(begin, end, comp);
|
||||||
elements_done += n;
|
elements_done += n;
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
total_elements_done += n;
|
total_elements_done += n;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Prefer own stack, small pieces.
|
// Prefer own stack, small pieces.
|
||||||
if (tl.leftover_parts.pop_front(current))
|
if (tl.leftover_parts.pop_front(current))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
#pragma omp atomic
|
# pragma omp atomic
|
||||||
*tl.elements_leftover -= elements_done;
|
*tl.elements_leftover -= elements_done;
|
||||||
elements_done = 0;
|
|
||||||
|
elements_done = 0;
|
||||||
|
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
double search_start = omp_get_wtime();
|
double search_start = omp_get_wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Look for new work.
|
// Look for new work.
|
||||||
bool success = false;
|
bool successfully_stolen = false;
|
||||||
while (*tl.elements_leftover > 0 && !success
|
while (wait && *tl.elements_leftover > 0 && !successfully_stolen
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
// Possible dead-lock.
|
// Possible dead-lock.
|
||||||
&& (omp_get_wtime() < (search_start + 1.0))
|
&& (omp_get_wtime() < (search_start + 1.0))
|
||||||
#endif
|
#endif
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
thread_index_t victim;
|
thread_index_t victim;
|
||||||
victim = rng(num_threads);
|
victim = rng(num_threads);
|
||||||
|
|
||||||
// Large pieces.
|
// Large pieces.
|
||||||
success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current);
|
successfully_stolen = (victim != iam)
|
||||||
if (!success)
|
&& tls[victim]->leftover_parts.pop_back(current);
|
||||||
yield();
|
if (!successfully_stolen)
|
||||||
|
yield();
|
||||||
#if !defined(__ICC) && !defined(__ECC)
|
#if !defined(__ICC) && !defined(__ECC)
|
||||||
#pragma omp flush
|
# pragma omp flush
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
if (omp_get_wtime() >= (search_start + 1.0))
|
if (omp_get_wtime() >= (search_start + 1.0))
|
||||||
{
|
{
|
||||||
sleep(1);
|
sleep(1);
|
||||||
_GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0));
|
_GLIBCXX_PARALLEL_ASSERT(
|
||||||
}
|
omp_get_wtime() < (search_start + 1.0));
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
if (!success)
|
if (!successfully_stolen)
|
||||||
{
|
{
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
_GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
|
_GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Top-level quicksort routine.
|
/** @brief Top-level quicksort routine.
|
||||||
* @param begin Begin iterator of sequence.
|
* @param begin Begin iterator of sequence.
|
||||||
* @param end End iterator of sequence.
|
* @param end End iterator of sequence.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
* @param n Length of the sequence to sort.
|
* @param n Length of the sequence to sort.
|
||||||
* @param num_threads Number of threads that are allowed to work on
|
* @param num_threads Number of threads that are allowed to work on
|
||||||
* this part.
|
* this part.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
|
parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
|
||||||
Comparator comp,
|
Comparator comp,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
|
typename std::iterator_traits<RandomAccessIterator>
|
||||||
|
::difference_type n,
|
||||||
|
thread_index_t num_threads)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(end - begin)
|
_GLIBCXX_CALL(end - begin)
|
||||||
|
|
||||||
|
|
@ -413,11 +441,11 @@ namespace __gnu_parallel
|
||||||
if (num_threads > n)
|
if (num_threads > n)
|
||||||
num_threads = static_cast<thread_index_t>(n);
|
num_threads = static_cast<thread_index_t>(n);
|
||||||
|
|
||||||
|
// Initialize thread local storage
|
||||||
tls_type** tls = new tls_type*[num_threads];
|
tls_type** tls = new tls_type*[num_threads];
|
||||||
|
difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
|
||||||
#pragma omp parallel num_threads(num_threads)
|
for (thread_index_t t = 0; t < num_threads; ++t)
|
||||||
// Initialize variables per processor.
|
tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
|
||||||
qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1));
|
|
||||||
|
|
||||||
// There can never be more than ceil(log2(n)) ranges on the stack, because
|
// There can never be more than ceil(log2(n)) ranges on the stack, because
|
||||||
// 1. Only one processor pushes onto the stack
|
// 1. Only one processor pushes onto the stack
|
||||||
|
|
@ -426,22 +454,16 @@ namespace __gnu_parallel
|
||||||
volatile difference_type elements_leftover = n;
|
volatile difference_type elements_leftover = n;
|
||||||
for (int i = 0; i < num_threads; i++)
|
for (int i = 0; i < num_threads; i++)
|
||||||
{
|
{
|
||||||
tls[i]->elements_leftover = &elements_leftover;
|
tls[i]->elements_leftover = &elements_leftover;
|
||||||
tls[i]->num_threads = num_threads;
|
tls[i]->num_threads = num_threads;
|
||||||
tls[i]->global = std::make_pair(begin, end);
|
tls[i]->global = std::make_pair(begin, end);
|
||||||
|
|
||||||
// Just in case nothing is left to assign.
|
// Just in case nothing is left to assign.
|
||||||
tls[i]->initial = std::make_pair(end, end);
|
tls[i]->initial = std::make_pair(end, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initial splitting, recursively.
|
|
||||||
int old_nested = omp_get_nested();
|
|
||||||
omp_set_nested(true);
|
|
||||||
|
|
||||||
// Main recursion call.
|
// Main recursion call.
|
||||||
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads);
|
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
|
||||||
|
|
||||||
omp_set_nested(old_nested);
|
|
||||||
|
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
// All stack must be empty.
|
// All stack must be empty.
|
||||||
|
|
|
||||||
|
|
@ -49,54 +49,70 @@ namespace __gnu_parallel
|
||||||
// XXX remove std::duplicates from here if possible,
|
// XXX remove std::duplicates from here if possible,
|
||||||
// XXX but keep minimal dependencies.
|
// XXX but keep minimal dependencies.
|
||||||
|
|
||||||
/** @brief Calculates the rounded-down logarithm of @c n for base 2.
|
/** @brief Calculates the rounded-down logarithm of @c n for base 2.
|
||||||
* @param n Argument.
|
* @param n Argument.
|
||||||
* @return Returns 0 for argument 0.
|
* @return Returns 0 for argument 0.
|
||||||
*/
|
*/
|
||||||
template<typename Size>
|
template<typename Size>
|
||||||
inline Size
|
inline Size
|
||||||
log2(Size n)
|
log2(Size n)
|
||||||
{
|
{
|
||||||
Size k;
|
Size k;
|
||||||
for (k = 0; n != 1; n >>= 1)
|
for (k = 0; n != 1; n >>= 1)
|
||||||
++k;
|
++k;
|
||||||
return k;
|
return k;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Encode two integers into one __gnu_parallel::lcas_t.
|
/** @brief Encode two integers into one __gnu_parallel::lcas_t.
|
||||||
* @param a First integer, to be encoded in the most-significant @c
|
* @param a First integer, to be encoded in the most-significant @c
|
||||||
* lcas_t_bits/2 bits.
|
* lcas_t_bits/2 bits.
|
||||||
* @param b Second integer, to be encoded in the least-significant
|
* @param b Second integer, to be encoded in the least-significant
|
||||||
* @c lcas_t_bits/2 bits.
|
* @c lcas_t_bits/2 bits.
|
||||||
* @return __gnu_parallel::lcas_t value encoding @c a and @c b.
|
* @return __gnu_parallel::lcas_t value encoding @c a and @c b.
|
||||||
* @see decode2
|
* @see decode2
|
||||||
*/
|
*/
|
||||||
inline lcas_t
|
inline lcas_t
|
||||||
encode2(int a, int b) //must all be non-negative, actually
|
encode2(int a, int b) //must all be non-negative, actually
|
||||||
{
|
{
|
||||||
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
|
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
|
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
|
||||||
* @param x __gnu_parallel::lcas_t to decode integers from.
|
* @param x __gnu_parallel::lcas_t to decode integers from.
|
||||||
* @param a First integer, to be decoded from the most-significant
|
* @param a First integer, to be decoded from the most-significant
|
||||||
* @c lcas_t_bits/2 bits of @c x.
|
* @c lcas_t_bits/2 bits of @c x.
|
||||||
* @param b Second integer, to be encoded in the least-significant
|
* @param b Second integer, to be encoded in the least-significant
|
||||||
* @c lcas_t_bits/2 bits of @c x.
|
* @c lcas_t_bits/2 bits of @c x.
|
||||||
* @see encode2
|
* @see encode2
|
||||||
*/
|
*/
|
||||||
inline void
|
inline void
|
||||||
decode2(lcas_t x, int& a, int& b)
|
decode2(lcas_t x, int& a, int& b)
|
||||||
{
|
{
|
||||||
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
|
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
|
||||||
b = (int)((x >> 0 ) & lcas_t_mask);
|
b = (int)((x >> 0 ) & lcas_t_mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Constructs predicate for equality from strict weak
|
/** @brief Equivalent to std::min. */
|
||||||
* ordering predicate
|
template<typename T>
|
||||||
*/
|
const T&
|
||||||
// XXX comparator at the end, as per others
|
min(const T& a, const T& b)
|
||||||
template<typename Comparator, typename T1, typename T2>
|
{
|
||||||
|
return (a < b) ? a : b;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** @brief Equivalent to std::max. */
|
||||||
|
template<typename T>
|
||||||
|
const T&
|
||||||
|
max(const T& a, const T& b)
|
||||||
|
{
|
||||||
|
return (a > b) ? a : b;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** @brief Constructs predicate for equality from strict weak
|
||||||
|
* ordering predicate
|
||||||
|
*/
|
||||||
|
// XXX comparator at the end, as per others
|
||||||
|
template<typename Comparator, typename T1, typename T2>
|
||||||
class equal_from_less : public std::binary_function<T1, T2, bool>
|
class equal_from_less : public std::binary_function<T1, T2, bool>
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
|
|
@ -112,162 +128,176 @@ namespace __gnu_parallel
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
|
/** @brief Similar to std::binder1st,
|
||||||
template<typename _Predicate, typename argument_type>
|
* but giving the argument types explicitly. */
|
||||||
class unary_negate
|
template<typename _Predicate, typename argument_type>
|
||||||
: public std::unary_function<argument_type, bool>
|
class unary_negate
|
||||||
{
|
: public std::unary_function<argument_type, bool>
|
||||||
protected:
|
{
|
||||||
_Predicate _M_pred;
|
protected:
|
||||||
|
_Predicate _M_pred;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit
|
explicit
|
||||||
unary_negate(const _Predicate& __x) : _M_pred(__x) { }
|
unary_negate(const _Predicate& __x) : _M_pred(__x) { }
|
||||||
|
|
||||||
bool
|
bool
|
||||||
operator()(const argument_type& __x)
|
operator()(const argument_type& __x)
|
||||||
{ return !_M_pred(__x); }
|
{ return !_M_pred(__x); }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
|
/** @brief Similar to std::binder1st,
|
||||||
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
|
* but giving the argument types explicitly. */
|
||||||
class binder1st
|
template<
|
||||||
: public std::unary_function<second_argument_type, result_type>
|
typename _Operation,
|
||||||
{
|
typename first_argument_type,
|
||||||
protected:
|
typename second_argument_type,
|
||||||
_Operation op;
|
typename result_type>
|
||||||
first_argument_type value;
|
class binder1st
|
||||||
|
: public std::unary_function<second_argument_type, result_type>
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
_Operation op;
|
||||||
|
first_argument_type value;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
binder1st(const _Operation& __x,
|
binder1st(const _Operation& __x,
|
||||||
const first_argument_type& __y)
|
const first_argument_type& __y)
|
||||||
: op(__x), value(__y) { }
|
: op(__x), value(__y) { }
|
||||||
|
|
||||||
result_type
|
result_type
|
||||||
operator()(const second_argument_type& __x)
|
operator()(const second_argument_type& __x)
|
||||||
{ return op(value, __x); }
|
{ return op(value, __x); }
|
||||||
|
|
||||||
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
||||||
// 109. Missing binders for non-const sequence elements
|
// 109. Missing binders for non-const sequence elements
|
||||||
result_type
|
result_type
|
||||||
operator()(second_argument_type& __x) const
|
operator()(second_argument_type& __x) const
|
||||||
{ return op(value, __x); }
|
{ return op(value, __x); }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Similar to std::binder2nd, but giving the argument types
|
* @brief Similar to std::binder2nd, but giving the argument types
|
||||||
* explicitly.
|
* explicitly.
|
||||||
*/
|
*/
|
||||||
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
|
template<
|
||||||
class binder2nd
|
typename _Operation,
|
||||||
: public std::unary_function<first_argument_type, result_type>
|
typename first_argument_type,
|
||||||
{
|
typename second_argument_type,
|
||||||
protected:
|
typename result_type>
|
||||||
_Operation op;
|
class binder2nd
|
||||||
second_argument_type value;
|
: public std::unary_function<first_argument_type, result_type>
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
_Operation op;
|
||||||
|
second_argument_type value;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
binder2nd(const _Operation& __x,
|
binder2nd(const _Operation& __x,
|
||||||
const second_argument_type& __y)
|
const second_argument_type& __y)
|
||||||
: op(__x), value(__y) { }
|
: op(__x), value(__y) { }
|
||||||
|
|
||||||
result_type
|
result_type
|
||||||
operator()(const first_argument_type& __x) const
|
operator()(const first_argument_type& __x) const
|
||||||
{ return op(__x, value); }
|
{ return op(__x, value); }
|
||||||
|
|
||||||
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
// _GLIBCXX_RESOLVE_LIB_DEFECTS
|
||||||
// 109. Missing binders for non-const sequence elements
|
// 109. Missing binders for non-const sequence elements
|
||||||
result_type
|
result_type
|
||||||
operator()(first_argument_type& __x)
|
operator()(first_argument_type& __x)
|
||||||
{ return op(__x, value); }
|
{ return op(__x, value); }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Similar to std::equal_to, but allows two different types. */
|
/** @brief Similar to std::equal_to, but allows two different types. */
|
||||||
template<typename T1, typename T2>
|
template<typename T1, typename T2>
|
||||||
struct equal_to : std::binary_function<T1, T2, bool>
|
struct equal_to : std::binary_function<T1, T2, bool>
|
||||||
{
|
{
|
||||||
bool operator()(const T1& t1, const T2& t2) const
|
bool operator()(const T1& t1, const T2& t2) const
|
||||||
{ return t1 == t2; }
|
{ return t1 == t2; }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Similar to std::less, but allows two different types. */
|
/** @brief Similar to std::less, but allows two different types. */
|
||||||
template<typename T1, typename T2>
|
template<typename T1, typename T2>
|
||||||
struct less : std::binary_function<T1, T2, bool>
|
struct less : std::binary_function<T1, T2, bool>
|
||||||
{
|
{
|
||||||
bool
|
bool
|
||||||
operator()(const T1& t1, const T2& t2) const
|
operator()(const T1& t1, const T2& t2) const
|
||||||
{ return t1 < t2; }
|
{ return t1 < t2; }
|
||||||
|
|
||||||
bool
|
bool
|
||||||
operator()(const T2& t2, const T1& t1) const
|
operator()(const T2& t2, const T1& t1) const
|
||||||
{ return t2 < t1; }
|
{ return t2 < t1; }
|
||||||
};
|
};
|
||||||
|
|
||||||
// Partial specialization for one type. Same as std::less.
|
// Partial specialization for one type. Same as std::less.
|
||||||
template<typename _Tp>
|
template<typename _Tp>
|
||||||
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
|
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
|
||||||
{
|
{
|
||||||
bool
|
bool
|
||||||
operator()(const _Tp& __x, const _Tp& __y) const
|
operator()(const _Tp& __x, const _Tp& __y) const
|
||||||
{ return __x < __y; }
|
{ return __x < __y; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** @brief Similar to std::plus, but allows two different types. */
|
/** @brief Similar to std::plus, but allows two different types. */
|
||||||
template<typename _Tp1, typename _Tp2>
|
template<typename _Tp1, typename _Tp2>
|
||||||
struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
|
struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
|
||||||
{
|
{
|
||||||
typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
|
typedef typeof(*static_cast<_Tp1*>(NULL)
|
||||||
|
+ *static_cast<_Tp2*>(NULL)) result;
|
||||||
|
|
||||||
result
|
result
|
||||||
operator()(const _Tp1& __x, const _Tp2& __y) const
|
operator()(const _Tp1& __x, const _Tp2& __y) const
|
||||||
{ return __x + __y; }
|
{ return __x + __y; }
|
||||||
};
|
};
|
||||||
|
|
||||||
// Partial specialization for one type. Same as std::plus.
|
// Partial specialization for one type. Same as std::plus.
|
||||||
template<typename _Tp>
|
template<typename _Tp>
|
||||||
struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
|
struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
|
||||||
{
|
{
|
||||||
typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
|
typedef typeof(*static_cast<_Tp*>(NULL)
|
||||||
|
+ *static_cast<_Tp*>(NULL)) result;
|
||||||
|
|
||||||
result
|
result
|
||||||
operator()(const _Tp& __x, const _Tp& __y) const
|
operator()(const _Tp& __x, const _Tp& __y) const
|
||||||
{ return __x + __y; }
|
{ return __x + __y; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** @brief Similar to std::multiplies, but allows two different types. */
|
/** @brief Similar to std::multiplies, but allows two different types. */
|
||||||
template<typename _Tp1, typename _Tp2>
|
template<typename _Tp1, typename _Tp2>
|
||||||
struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
|
struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
|
||||||
{
|
{
|
||||||
typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
|
typedef typeof(*static_cast<_Tp1*>(NULL)
|
||||||
|
* *static_cast<_Tp2*>(NULL)) result;
|
||||||
|
|
||||||
result
|
result
|
||||||
operator()(const _Tp1& __x, const _Tp2& __y) const
|
operator()(const _Tp1& __x, const _Tp2& __y) const
|
||||||
{ return __x * __y; }
|
{ return __x * __y; }
|
||||||
};
|
};
|
||||||
|
|
||||||
// Partial specialization for one type. Same as std::multiplies.
|
// Partial specialization for one type. Same as std::multiplies.
|
||||||
template<typename _Tp>
|
template<typename _Tp>
|
||||||
struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
|
struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
|
||||||
{
|
{
|
||||||
typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
|
typedef typeof(*static_cast<_Tp*>(NULL)
|
||||||
|
* *static_cast<_Tp*>(NULL)) result;
|
||||||
|
|
||||||
result
|
result
|
||||||
operator()(const _Tp& __x, const _Tp& __y) const
|
operator()(const _Tp& __x, const _Tp& __y) const
|
||||||
{ return __x * __y; }
|
{ return __x * __y; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<typename T, typename _DifferenceTp>
|
template<typename T, typename _DifferenceTp>
|
||||||
class pseudo_sequence;
|
class pseudo_sequence;
|
||||||
|
|
||||||
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
|
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
|
||||||
* It features the usual random-access iterator functionality.
|
* It features the usual random-access iterator functionality.
|
||||||
* @param T Sequence value type.
|
* @param T Sequence value type.
|
||||||
* @param difference_type Sequence difference type.
|
* @param difference_type Sequence difference type.
|
||||||
*/
|
*/
|
||||||
template<typename T, typename _DifferenceTp>
|
template<typename T, typename _DifferenceTp>
|
||||||
class pseudo_sequence_iterator
|
class pseudo_sequence_iterator
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
@ -296,34 +326,34 @@ namespace __gnu_parallel
|
||||||
operator++(int)
|
operator++(int)
|
||||||
{ return type(pos++); }
|
{ return type(pos++); }
|
||||||
|
|
||||||
const T&
|
const T&
|
||||||
operator*() const
|
operator*() const
|
||||||
{ return val; }
|
{ return val; }
|
||||||
|
|
||||||
const T&
|
const T&
|
||||||
operator[](difference_type) const
|
operator[](difference_type) const
|
||||||
{ return val; }
|
{ return val; }
|
||||||
|
|
||||||
bool
|
bool
|
||||||
operator==(const type& i2)
|
operator==(const type& i2)
|
||||||
{ return pos == i2.pos; }
|
{ return pos == i2.pos; }
|
||||||
|
|
||||||
difference_type
|
difference_type
|
||||||
operator!=(const type& i2)
|
operator!=(const type& i2)
|
||||||
{ return pos != i2.pos; }
|
{ return pos != i2.pos; }
|
||||||
|
|
||||||
difference_type
|
difference_type
|
||||||
operator-(const type& i2)
|
operator-(const type& i2)
|
||||||
{ return pos - i2.pos; }
|
{ return pos - i2.pos; }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Sequence that conceptually consists of multiple copies of
|
/** @brief Sequence that conceptually consists of multiple copies of
|
||||||
the same element.
|
the same element.
|
||||||
* The copies are not stored explicitly, of course.
|
* The copies are not stored explicitly, of course.
|
||||||
* @param T Sequence value type.
|
* @param T Sequence value type.
|
||||||
* @param difference_type Sequence difference type.
|
* @param difference_type Sequence difference type.
|
||||||
*/
|
*/
|
||||||
template<typename T, typename _DifferenceTp>
|
template<typename T, typename _DifferenceTp>
|
||||||
class pseudo_sequence
|
class pseudo_sequence
|
||||||
{
|
{
|
||||||
typedef pseudo_sequence<T, _DifferenceTp> type;
|
typedef pseudo_sequence<T, _DifferenceTp> type;
|
||||||
|
|
@ -335,10 +365,10 @@ namespace __gnu_parallel
|
||||||
typedef pseudo_sequence_iterator<T, uint64> iterator;
|
typedef pseudo_sequence_iterator<T, uint64> iterator;
|
||||||
|
|
||||||
/** @brief Constructor.
|
/** @brief Constructor.
|
||||||
* @param val Element of the sequence.
|
* @param val Element of the sequence.
|
||||||
* @param count Number of (virtual) copies.
|
* @param count Number of (virtual) copies.
|
||||||
*/
|
*/
|
||||||
pseudo_sequence(const T& val, difference_type count)
|
pseudo_sequence(const T& val, difference_type count)
|
||||||
: val(val), count(count) { }
|
: val(val), count(count) { }
|
||||||
|
|
||||||
/** @brief Begin iterator. */
|
/** @brief Begin iterator. */
|
||||||
|
|
@ -356,67 +386,66 @@ namespace __gnu_parallel
|
||||||
difference_type count;
|
difference_type count;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Functor that does nothing */
|
/** @brief Functor that does nothing */
|
||||||
template<typename _ValueTp>
|
template<typename _ValueTp>
|
||||||
class void_functor
|
class void_functor
|
||||||
{
|
{
|
||||||
inline void
|
inline void
|
||||||
operator()(const _ValueTp& v) const { }
|
operator()(const _ValueTp& v) const { }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Compute the median of three referenced elements,
|
/** @brief Compute the median of three referenced elements,
|
||||||
according to @c comp.
|
according to @c comp.
|
||||||
* @param a First iterator.
|
* @param a First iterator.
|
||||||
* @param b Second iterator.
|
* @param b Second iterator.
|
||||||
* @param c Third iterator.
|
* @param c Third iterator.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
RandomAccessIterator
|
RandomAccessIterator
|
||||||
median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
|
median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
|
||||||
RandomAccessIterator c, Comparator& comp)
|
RandomAccessIterator c, Comparator& comp)
|
||||||
{
|
{
|
||||||
if (comp(*a, *b))
|
if (comp(*a, *b))
|
||||||
if (comp(*b, *c))
|
if (comp(*b, *c))
|
||||||
return b;
|
return b;
|
||||||
else
|
else
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
return c;
|
return c;
|
||||||
else
|
else
|
||||||
return a;
|
return a;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Just swap a and b.
|
// Just swap a and b.
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
return a;
|
return a;
|
||||||
else
|
else
|
||||||
if (comp(*b, *c))
|
if (comp(*b, *c))
|
||||||
return c;
|
return c;
|
||||||
else
|
else
|
||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Avoid the use of assert, because we're trying to keep the <cassert>
|
// Avoid the use of assert, because we're trying to keep the <cassert>
|
||||||
// include out of the mix. (Same as debug mode).
|
// include out of the mix. (Same as debug mode).
|
||||||
inline void
|
inline void
|
||||||
__replacement_assert(const char* __file, int __line,
|
__replacement_assert(const char* __file, int __line,
|
||||||
const char* __function, const char* __condition)
|
const char* __function, const char* __condition)
|
||||||
{
|
{
|
||||||
std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
|
std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
|
||||||
__function, __condition);
|
__function, __condition);
|
||||||
__builtin_abort();
|
__builtin_abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
#define _GLIBCXX_PARALLEL_ASSERT(_Condition) \
|
#define _GLIBCXX_PARALLEL_ASSERT(_Condition) \
|
||||||
do \
|
do \
|
||||||
{ \
|
{ \
|
||||||
if (!(_Condition)) \
|
if (!(_Condition)) \
|
||||||
__gnu_parallel::__replacement_assert(__FILE__, __LINE__, \
|
__gnu_parallel::__replacement_assert(__FILE__, __LINE__, \
|
||||||
__PRETTY_FUNCTION__, #_Condition); \
|
__PRETTY_FUNCTION__, #_Condition); \
|
||||||
} while (false)
|
} while (false)
|
||||||
|
|
||||||
} //namespace __gnu_parallel
|
} //namespace __gnu_parallel
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,7 @@
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
|
||||||
/** @brief Determine verbosity level of the parallel mode.
|
/** @brief Determine verbosity level of the parallel mode.
|
||||||
* Level 1 prints a message each time when entering a parallel-mode function. */
|
* Level 1 prints a message each time a parallel-mode function is entered. */
|
||||||
#define _GLIBCXX_VERBOSE_LEVEL 0
|
#define _GLIBCXX_VERBOSE_LEVEL 0
|
||||||
|
|
||||||
/** @def _GLIBCXX_CALL
|
/** @def _GLIBCXX_CALL
|
||||||
|
|
@ -50,27 +50,40 @@
|
||||||
#define _GLIBCXX_CALL(n)
|
#define _GLIBCXX_CALL(n)
|
||||||
#endif
|
#endif
|
||||||
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
|
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
|
||||||
#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
|
#define _GLIBCXX_CALL(n) \
|
||||||
|
printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
|
||||||
|
__PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef _GLIBCXX_SCALE_DOWN_FPU
|
||||||
/** @brief Use floating-point scaling instead of modulo for mapping
|
/** @brief Use floating-point scaling instead of modulo for mapping
|
||||||
* random numbers to a range. This can be faster on certain CPUs. */
|
* random numbers to a range. This can be faster on certain CPUs. */
|
||||||
#define _GLIBCXX_SCALE_DOWN_FPU 0
|
#define _GLIBCXX_SCALE_DOWN_FPU 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef _GLIBCXX_ASSERTIONS
|
||||||
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
|
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
|
||||||
* Should be switched on only locally. */
|
* Should be switched on only locally. */
|
||||||
#define _GLIBCXX_ASSERTIONS 0
|
#define _GLIBCXX_ASSERTIONS 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
||||||
/** @brief Tuning switch for the parallel random shuffle.
|
/** @brief Tuning switch for the parallel random shuffle.
|
||||||
* Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
|
* Consider the size of the L1 cache for
|
||||||
|
* __gnu_parallel::parallel_random_shuffle(). */
|
||||||
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
|
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
|
||||||
|
#endif
|
||||||
|
#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
||||||
/** @brief Tuning switch for the parallel random shuffle.
|
/** @brief Tuning switch for the parallel random shuffle.
|
||||||
* Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
|
* Consider the size of the TLB for
|
||||||
|
* __gnu_parallel::parallel_random_shuffle(). */
|
||||||
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
|
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
||||||
/** @brief First copy the data, sort it locally, and merge it back
|
/** @brief First copy the data, sort it locally, and merge it back
|
||||||
* (0); or copy it back after everything is done (1).
|
* (0); or copy it back after everything is done (1).
|
||||||
*
|
*
|
||||||
* Recommendation: 0 */
|
* Recommendation: 0 */
|
||||||
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
|
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
|
||||||
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -39,30 +39,58 @@
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
/** @brief Function to split a sequence into parts of almost equal size.
|
/** @brief Function to split a sequence into parts of almost equal size.
|
||||||
*
|
*
|
||||||
* The resulting sequence s of length p+1 contains the splitting
|
* The resulting sequence s of length num_threads+1 contains the splitting
|
||||||
* positions when splitting the range [0,n) into parts of almost
|
* positions when splitting the range [0,n) into parts of almost
|
||||||
* equal size (plus minus 1). The first entry is 0, the last one
|
* equal size (plus minus 1). The first entry is 0, the last one
|
||||||
* n. Some of the resulting parts may be empty.
|
* n. Some of the resulting parts may be empty.
|
||||||
* @param n Number of elements
|
* @param n Number of elements
|
||||||
* @param p Number of parts
|
* @param num_threads Number of parts
|
||||||
* @param s Splitters
|
* @param s Splitters
|
||||||
* @returns End of splitter sequence, i. e. @c s+p+1 */
|
* @returns End of splitter sequence, i. e. @c s+num_threads+1 */
|
||||||
template<typename _DifferenceTp, typename OutputIterator>
|
template<typename difference_type, typename OutputIterator>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
|
equally_split(difference_type n,
|
||||||
|
thread_index_t num_threads,
|
||||||
|
OutputIterator s)
|
||||||
{
|
{
|
||||||
typedef _DifferenceTp difference_type;
|
difference_type chunk_length = n / num_threads,
|
||||||
difference_type chunk_length = n / p, split = n % p, start = 0;
|
num_longer_chunks = n % num_threads,
|
||||||
for (int i = 0; i < p; i++)
|
pos = 0;
|
||||||
|
for (thread_index_t i = 0; i < num_threads; ++i)
|
||||||
{
|
{
|
||||||
*s++ = start;
|
*s++ = pos;
|
||||||
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
|
pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
|
||||||
}
|
}
|
||||||
*s++ = n;
|
*s++ = n;
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** @brief Function to split a sequence into parts of almost equal size.
|
||||||
|
*
|
||||||
|
* Returns the position of the splitting point between
|
||||||
|
* thread number thread_no (included) and
|
||||||
|
* thread number thread_no+1 (excluded).
|
||||||
|
* @param n Number of elements
|
||||||
|
* @param num_threads Number of parts
|
||||||
|
* @returns Splitting point */
|
||||||
|
template<typename difference_type>
|
||||||
|
difference_type
|
||||||
|
equally_split_point(difference_type n,
|
||||||
|
thread_index_t num_threads,
|
||||||
|
thread_index_t thread_no)
|
||||||
|
{
|
||||||
|
difference_type chunk_length = n / num_threads,
|
||||||
|
num_longer_chunks = n % num_threads;
|
||||||
|
|
||||||
|
if(thread_no < num_longer_chunks)
|
||||||
|
return thread_no * (chunk_length + 1);
|
||||||
|
else
|
||||||
|
return num_longer_chunks * (chunk_length + 1)
|
||||||
|
+ (thread_no - num_longer_chunks) * chunk_length;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,7 @@
|
||||||
* @brief Include guarded (sequences may run empty) loser tree,
|
* @brief Include guarded (sequences may run empty) loser tree,
|
||||||
* moving objects.
|
* moving objects.
|
||||||
* @see __gnu_parallel::Settings multiway_merge_algorithm */
|
* @see __gnu_parallel::Settings multiway_merge_algorithm */
|
||||||
#define _GLIBCXX_LOSER_TREE 0
|
#define _GLIBCXX_LOSER_TREE 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
|
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@
|
||||||
|
|
||||||
// This library is distributed in the hope that it will be useful, but
|
// This library is distributed in the hope that it will be useful, but
|
||||||
// WITHOUT ANY WARRANTY; without even the implied warranty of
|
// WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
// General Public License for more details.
|
// General Public License for more details.
|
||||||
|
|
||||||
// You should have received a copy of the GNU General Public License
|
// You should have received a copy of the GNU General Public License
|
||||||
|
|
@ -48,50 +48,66 @@
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* @brief Parallel std::find, switch for different algorithms.
|
* @brief Parallel std::find, switch for different algorithms.
|
||||||
* @param begin1 Begin iterator of first sequence.
|
* @param begin1 Begin iterator of first sequence.
|
||||||
* @param end1 End iterator of first sequence.
|
* @param end1 End iterator of first sequence.
|
||||||
* @param begin2 Begin iterator of second sequence. Must have same
|
* @param begin2 Begin iterator of second sequence. Must have same
|
||||||
* length as first sequence.
|
* length as first sequence.
|
||||||
* @param pred Find predicate.
|
* @param pred Find predicate.
|
||||||
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
||||||
* @return Place of finding in both sequences.
|
* @return Place of finding in both sequences.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
|
template<
|
||||||
|
typename RandomAccessIterator1,
|
||||||
|
typename RandomAccessIterator2,
|
||||||
|
typename Pred,
|
||||||
|
typename Selector>
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
||||||
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
|
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
|
||||||
RandomAccessIterator2 begin2, Pred pred, Selector selector)
|
RandomAccessIterator2 begin2, Pred pred, Selector selector)
|
||||||
{
|
{
|
||||||
switch (Settings::find_distribution)
|
switch (Settings::find_distribution)
|
||||||
{
|
{
|
||||||
case Settings::GROWING_BLOCKS:
|
case Settings::GROWING_BLOCKS:
|
||||||
return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag());
|
return find_template(begin1, end1, begin2, pred, selector,
|
||||||
|
growing_blocks_tag());
|
||||||
case Settings::CONSTANT_SIZE_BLOCKS:
|
case Settings::CONSTANT_SIZE_BLOCKS:
|
||||||
return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag());
|
return find_template(begin1, end1, begin2, pred, selector,
|
||||||
|
constant_size_blocks_tag());
|
||||||
case Settings::EQUAL_SPLIT:
|
case Settings::EQUAL_SPLIT:
|
||||||
return find_template(begin1, end1, begin2, pred, selector, equal_split_tag());
|
return find_template(begin1, end1, begin2, pred, selector,
|
||||||
|
equal_split_tag());
|
||||||
default:
|
default:
|
||||||
_GLIBCXX_PARALLEL_ASSERT(false);
|
_GLIBCXX_PARALLEL_ASSERT(false);
|
||||||
return std::make_pair(begin1, begin2);
|
return std::make_pair(begin1, begin2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if _GLIBCXX_FIND_EQUAL_SPLIT
|
#if _GLIBCXX_FIND_EQUAL_SPLIT
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Parallel std::find, equal splitting variant.
|
* @brief Parallel std::find, equal splitting variant.
|
||||||
* @param begin1 Begin iterator of first sequence.
|
* @param begin1 Begin iterator of first sequence.
|
||||||
* @param end1 End iterator of first sequence.
|
* @param end1 End iterator of first sequence.
|
||||||
* @param begin2 Begin iterator of second sequence. Second sequence
|
* @param begin2 Begin iterator of second sequence. Second sequence
|
||||||
* must have same length as first sequence.
|
* must have same length as first sequence.
|
||||||
* @param pred Find predicate.
|
* @param pred Find predicate.
|
||||||
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
||||||
* @return Place of finding in both sequences.
|
* @return Place of finding in both sequences.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
|
template<
|
||||||
|
typename RandomAccessIterator1,
|
||||||
|
typename RandomAccessIterator2,
|
||||||
|
typename Pred,
|
||||||
|
typename Selector>
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
||||||
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag)
|
find_template(RandomAccessIterator1 begin1,
|
||||||
|
RandomAccessIterator1 end1,
|
||||||
|
RandomAccessIterator2 begin2,
|
||||||
|
Pred pred,
|
||||||
|
Selector selector,
|
||||||
|
equal_split_tag)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(end1 - begin1)
|
_GLIBCXX_CALL(end1 - begin1)
|
||||||
|
|
||||||
|
|
@ -100,79 +116,89 @@ namespace __gnu_parallel
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
||||||
difference_type length = end1 - begin1;
|
difference_type length = end1 - begin1;
|
||||||
|
|
||||||
difference_type result = length;
|
difference_type result = length;
|
||||||
|
difference_type* borders;
|
||||||
|
|
||||||
const thread_index_t num_threads = get_max_threads();
|
|
||||||
omp_lock_t result_lock;
|
omp_lock_t result_lock;
|
||||||
omp_init_lock(&result_lock);
|
omp_init_lock(&result_lock);
|
||||||
|
|
||||||
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
|
thread_index_t num_threads = get_max_threads();
|
||||||
|
# pragma omp parallel num_threads(num_threads)
|
||||||
|
{
|
||||||
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
borders = new difference_type[num_threads + 1];
|
||||||
|
equally_split(length, num_threads, borders);
|
||||||
|
} //single
|
||||||
|
|
||||||
equally_split(length, num_threads, borders);
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
difference_type start = borders[iam], stop = borders[iam + 1];
|
||||||
|
|
||||||
#pragma omp parallel shared(result) num_threads(num_threads)
|
RandomAccessIterator1 i1 = begin1 + start;
|
||||||
{
|
RandomAccessIterator2 i2 = begin2 + start;
|
||||||
int iam = omp_get_thread_num();
|
for (difference_type pos = start; pos < stop; ++pos)
|
||||||
difference_type pos = borders[iam], limit = borders[iam + 1];
|
{
|
||||||
|
#pragma omp flush(result)
|
||||||
RandomAccessIterator1 i1 = begin1 + pos;
|
// Result has been set to something lower.
|
||||||
RandomAccessIterator2 i2 = begin2 + pos;
|
if (result < pos)
|
||||||
for (; pos < limit; pos++)
|
|
||||||
{
|
|
||||||
#pragma omp flush(result)
|
|
||||||
// Result has been set to something lower.
|
|
||||||
if (result < pos)
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (selector(i1, i2, pred))
|
|
||||||
{
|
|
||||||
omp_set_lock(&result_lock);
|
|
||||||
if (result > pos)
|
|
||||||
result = pos;
|
|
||||||
omp_unset_lock(&result_lock);
|
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
i1++;
|
if (selector(i1, i2, pred))
|
||||||
i2++;
|
{
|
||||||
}
|
omp_set_lock(&result_lock);
|
||||||
}
|
if (pos < result)
|
||||||
|
result = pos;
|
||||||
|
omp_unset_lock(&result_lock);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
++i1;
|
||||||
|
++i2;
|
||||||
|
}
|
||||||
|
} //parallel
|
||||||
|
|
||||||
omp_destroy_lock(&result_lock);
|
omp_destroy_lock(&result_lock);
|
||||||
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
|
delete[] borders;
|
||||||
|
|
||||||
|
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
|
||||||
|
begin1 + result, begin2 + result);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if _GLIBCXX_FIND_GROWING_BLOCKS
|
#if _GLIBCXX_FIND_GROWING_BLOCKS
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Parallel std::find, growing block size variant.
|
* @brief Parallel std::find, growing block size variant.
|
||||||
* @param begin1 Begin iterator of first sequence.
|
* @param begin1 Begin iterator of first sequence.
|
||||||
* @param end1 End iterator of first sequence.
|
* @param end1 End iterator of first sequence.
|
||||||
* @param begin2 Begin iterator of second sequence. Second sequence
|
* @param begin2 Begin iterator of second sequence. Second sequence
|
||||||
* must have same length as first sequence.
|
* must have same length as first sequence.
|
||||||
* @param pred Find predicate.
|
* @param pred Find predicate.
|
||||||
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
||||||
* @return Place of finding in both sequences.
|
* @return Place of finding in both sequences.
|
||||||
* @see __gnu_parallel::Settings::find_sequential_search_size
|
* @see __gnu_parallel::Settings::find_sequential_search_size
|
||||||
* @see __gnu_parallel::Settings::find_initial_block_size
|
* @see __gnu_parallel::Settings::find_initial_block_size
|
||||||
* @see __gnu_parallel::Settings::find_maximum_block_size
|
* @see __gnu_parallel::Settings::find_maximum_block_size
|
||||||
* @see __gnu_parallel::Settings::find_increasing_factor
|
* @see __gnu_parallel::Settings::find_increasing_factor
|
||||||
*
|
*
|
||||||
* There are two main differences between the growing blocks and
|
* There are two main differences between the growing blocks and
|
||||||
* the constant-size blocks variants.
|
* the constant-size blocks variants.
|
||||||
* 1. For GB, the block size grows; for CSB, the block size is fixed.
|
* 1. For GB, the block size grows; for CSB, the block size is fixed.
|
||||||
|
|
||||||
* 2. For GB, the blocks are allocated dynamically;
|
* 2. For GB, the blocks are allocated dynamically;
|
||||||
* for CSB, the blocks are allocated in a predetermined manner,
|
* for CSB, the blocks are allocated in a predetermined manner,
|
||||||
* namely spatial round-robin.
|
* namely spatial round-robin.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
|
template<
|
||||||
|
typename RandomAccessIterator1,
|
||||||
|
typename RandomAccessIterator2,
|
||||||
|
typename Pred,
|
||||||
|
typename Selector>
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
||||||
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
|
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
|
||||||
RandomAccessIterator2 begin2, Pred pred, Selector selector,
|
RandomAccessIterator2 begin2, Pred pred, Selector selector,
|
||||||
growing_blocks_tag)
|
growing_blocks_tag)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(end1 - begin1)
|
_GLIBCXX_CALL(end1 - begin1)
|
||||||
|
|
||||||
|
|
@ -182,101 +208,118 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
difference_type length = end1 - begin1;
|
difference_type length = end1 - begin1;
|
||||||
|
|
||||||
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
|
difference_type sequential_search_size = std::min<difference_type>(
|
||||||
|
length, Settings::find_sequential_search_size);
|
||||||
|
|
||||||
// Try it sequentially first.
|
// Try it sequentially first.
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
|
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
|
||||||
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
|
selector.sequential_algorithm(
|
||||||
|
begin1, begin1 + sequential_search_size, begin2, pred);
|
||||||
|
|
||||||
if (find_seq_result.first != (begin1 + sequential_search_size))
|
if (find_seq_result.first != (begin1 + sequential_search_size))
|
||||||
return find_seq_result;
|
return find_seq_result;
|
||||||
|
|
||||||
// Index of beginning of next free block (after sequential find).
|
// Index of beginning of next free block (after sequential find).
|
||||||
difference_type next_block_pos = sequential_search_size;
|
difference_type next_block_start = sequential_search_size;
|
||||||
difference_type result = length;
|
difference_type result = length;
|
||||||
const thread_index_t num_threads = get_max_threads();
|
|
||||||
|
|
||||||
omp_lock_t result_lock;
|
omp_lock_t result_lock;
|
||||||
omp_init_lock(&result_lock);
|
omp_init_lock(&result_lock);
|
||||||
|
|
||||||
#pragma omp parallel shared(result) num_threads(num_threads)
|
thread_index_t num_threads = get_max_threads();
|
||||||
{
|
# pragma omp parallel shared(result) num_threads(num_threads)
|
||||||
// Not within first k elements -> start parallel.
|
{
|
||||||
thread_index_t iam = omp_get_thread_num();
|
# pragma omp single
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
|
||||||
difference_type block_size = Settings::find_initial_block_size;
|
// Not within first k elements -> start parallel.
|
||||||
difference_type start = fetch_and_add<difference_type>(&next_block_pos, block_size);
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
// Get new block, update pointer to next block.
|
difference_type block_size = Settings::find_initial_block_size;
|
||||||
difference_type stop = std::min<difference_type>(length, start + block_size);
|
difference_type start =
|
||||||
|
fetch_and_add<difference_type>(&next_block_start, block_size);
|
||||||
|
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
|
// Get new block, update pointer to next block.
|
||||||
|
difference_type stop =
|
||||||
|
std::min<difference_type>(length, start + block_size);
|
||||||
|
|
||||||
while (start < length)
|
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
|
||||||
{
|
|
||||||
#pragma omp flush(result)
|
|
||||||
// Get new value of result.
|
|
||||||
if (result < start)
|
|
||||||
{
|
|
||||||
// No chance to find first element.
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
|
while (start < length)
|
||||||
if (local_result.first != (begin1 + stop))
|
{
|
||||||
{
|
# pragma omp flush(result)
|
||||||
omp_set_lock(&result_lock);
|
// Get new value of result.
|
||||||
if ((local_result.first - begin1) < result)
|
if (result < start)
|
||||||
{
|
{
|
||||||
result = local_result.first - begin1;
|
// No chance to find first element.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// Result cannot be in future blocks, stop algorithm.
|
local_result = selector.sequential_algorithm(
|
||||||
fetch_and_add<difference_type>(&next_block_pos, length);
|
begin1 + start, begin1 + stop, begin2 + start, pred);
|
||||||
}
|
if (local_result.first != (begin1 + stop))
|
||||||
omp_unset_lock(&result_lock);
|
{
|
||||||
}
|
omp_set_lock(&result_lock);
|
||||||
|
if ((local_result.first - begin1) < result)
|
||||||
|
{
|
||||||
|
result = local_result.first - begin1;
|
||||||
|
|
||||||
block_size = std::min<difference_type>(block_size * Settings::find_increasing_factor, Settings::find_maximum_block_size);
|
// Result cannot be in future blocks, stop algorithm.
|
||||||
|
fetch_and_add<difference_type>(&next_block_start, length);
|
||||||
|
}
|
||||||
|
omp_unset_lock(&result_lock);
|
||||||
|
}
|
||||||
|
|
||||||
// Get new block, update pointer to next block.
|
block_size = std::min<difference_type>(
|
||||||
start = fetch_and_add<difference_type>(&next_block_pos, block_size);
|
block_size * Settings::find_increasing_factor,
|
||||||
stop = (length < (start + block_size)) ? length : (start + block_size);
|
Settings::find_maximum_block_size);
|
||||||
}
|
|
||||||
}
|
// Get new block, update pointer to next block.
|
||||||
|
start =
|
||||||
|
fetch_and_add<difference_type>(&next_block_start, block_size);
|
||||||
|
stop = (length < (start + block_size)) ?
|
||||||
|
length : (start + block_size);
|
||||||
|
}
|
||||||
|
} //parallel
|
||||||
|
|
||||||
omp_destroy_lock(&result_lock);
|
omp_destroy_lock(&result_lock);
|
||||||
|
|
||||||
// Return iterator on found element.
|
// Return iterator on found element.
|
||||||
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
|
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
|
||||||
|
begin1 + result, begin2 + result);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS
|
#if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Parallel std::find, constant block size variant.
|
* @brief Parallel std::find, constant block size variant.
|
||||||
* @param begin1 Begin iterator of first sequence.
|
* @param begin1 Begin iterator of first sequence.
|
||||||
* @param end1 End iterator of first sequence.
|
* @param end1 End iterator of first sequence.
|
||||||
* @param begin2 Begin iterator of second sequence. Second sequence
|
* @param begin2 Begin iterator of second sequence. Second sequence
|
||||||
* must have same length as first sequence.
|
* must have same length as first sequence.
|
||||||
* @param pred Find predicate.
|
* @param pred Find predicate.
|
||||||
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
* @param selector Functionality (e. g. std::find_if (), std::equal(),...)
|
||||||
* @return Place of finding in both sequences.
|
* @return Place of finding in both sequences.
|
||||||
* @see __gnu_parallel::Settings::find_sequential_search_size
|
* @see __gnu_parallel::Settings::find_sequential_search_size
|
||||||
* @see __gnu_parallel::Settings::find_block_size
|
* @see __gnu_parallel::Settings::find_block_size
|
||||||
* There are two main differences between the growing blocks and the
|
* There are two main differences between the growing blocks and the
|
||||||
* constant-size blocks variants.
|
* constant-size blocks variants.
|
||||||
* 1. For GB, the block size grows; for CSB, the block size is fixed.
|
* 1. For GB, the block size grows; for CSB, the block size is fixed.
|
||||||
* 2. For GB, the blocks are allocated dynamically; for CSB, the
|
* 2. For GB, the blocks are allocated dynamically; for CSB, the
|
||||||
* blocks are allocated in a predetermined manner, namely spatial
|
* blocks are allocated in a predetermined manner, namely spatial
|
||||||
* round-robin.
|
* round-robin.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector>
|
template<
|
||||||
|
typename RandomAccessIterator1,
|
||||||
|
typename RandomAccessIterator2,
|
||||||
|
typename Pred,
|
||||||
|
typename Selector>
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
std::pair<RandomAccessIterator1, RandomAccessIterator2>
|
||||||
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
|
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
|
||||||
RandomAccessIterator2 begin2, Pred pred, Selector selector,
|
RandomAccessIterator2 begin2, Pred pred, Selector selector,
|
||||||
constant_size_blocks_tag)
|
constant_size_blocks_tag)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(end1 - begin1)
|
_GLIBCXX_CALL(end1 - begin1)
|
||||||
typedef std::iterator_traits<RandomAccessIterator1> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator1> traits_type;
|
||||||
|
|
@ -285,72 +328,77 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
difference_type length = end1 - begin1;
|
difference_type length = end1 - begin1;
|
||||||
|
|
||||||
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size);
|
difference_type sequential_search_size = std::min<difference_type>(
|
||||||
|
length, Settings::find_sequential_search_size);
|
||||||
|
|
||||||
// Try it sequentially first.
|
// Try it sequentially first.
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
|
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
|
||||||
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred);
|
selector.sequential_algorithm(begin1, begin1 + sequential_search_size,
|
||||||
|
begin2, pred);
|
||||||
|
|
||||||
if (find_seq_result.first != (begin1 + sequential_search_size))
|
if (find_seq_result.first != (begin1 + sequential_search_size))
|
||||||
return find_seq_result;
|
return find_seq_result;
|
||||||
|
|
||||||
difference_type result = length;
|
difference_type result = length;
|
||||||
const thread_index_t num_threads = get_max_threads();
|
|
||||||
|
|
||||||
omp_lock_t result_lock;
|
omp_lock_t result_lock;
|
||||||
omp_init_lock(&result_lock);
|
omp_init_lock(&result_lock);
|
||||||
|
|
||||||
// Not within first sequential_search_size elements -> start parallel.
|
// Not within first sequential_search_size elements -> start parallel.
|
||||||
#pragma omp parallel shared(result) num_threads(num_threads)
|
|
||||||
{
|
|
||||||
thread_index_t iam = omp_get_thread_num();
|
|
||||||
difference_type block_size = Settings::find_initial_block_size;
|
|
||||||
|
|
||||||
difference_type start, stop;
|
thread_index_t num_threads = get_max_threads();
|
||||||
|
# pragma omp parallel shared(result) num_threads(num_threads)
|
||||||
|
{
|
||||||
|
# pragma omp single
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
|
||||||
// First element of thread's current iteration.
|
thread_index_t iam = omp_get_thread_num();
|
||||||
difference_type iteration_start = sequential_search_size;
|
difference_type block_size = Settings::find_initial_block_size;
|
||||||
|
|
||||||
// Where to work (initialization).
|
// First element of thread's current iteration.
|
||||||
start = iteration_start + iam * block_size;
|
difference_type iteration_start = sequential_search_size;
|
||||||
stop = std::min<difference_type>(length, start + block_size);
|
|
||||||
|
|
||||||
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
|
// Where to work (initialization).
|
||||||
|
difference_type start = iteration_start + iam * block_size;
|
||||||
|
difference_type stop =
|
||||||
|
std::min<difference_type>(length, start + block_size);
|
||||||
|
|
||||||
while (start < length)
|
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
|
||||||
{
|
|
||||||
// Get new value of result.
|
|
||||||
#pragma omp flush(result)
|
|
||||||
// No chance to find first element.
|
|
||||||
if (result < start)
|
|
||||||
break;
|
|
||||||
|
|
||||||
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred);
|
while (start < length)
|
||||||
if (local_result.first != (begin1 + stop))
|
{
|
||||||
{
|
// Get new value of result.
|
||||||
omp_set_lock(&result_lock);
|
# pragma omp flush(result)
|
||||||
if ((local_result.first - begin1) < result)
|
// No chance to find first element.
|
||||||
result = local_result.first - begin1;
|
if (result < start)
|
||||||
omp_unset_lock(&result_lock);
|
break;
|
||||||
// Will not find better value in its interval.
|
local_result = selector.sequential_algorithm(
|
||||||
break;
|
begin1 + start, begin1 + stop,
|
||||||
}
|
begin2 + start, pred);
|
||||||
|
if (local_result.first != (begin1 + stop))
|
||||||
|
{
|
||||||
|
omp_set_lock(&result_lock);
|
||||||
|
if ((local_result.first - begin1) < result)
|
||||||
|
result = local_result.first - begin1;
|
||||||
|
omp_unset_lock(&result_lock);
|
||||||
|
// Will not find better value in its interval.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
iteration_start += num_threads * block_size;
|
iteration_start += num_threads * block_size;
|
||||||
|
|
||||||
// Where to work.
|
// Where to work.
|
||||||
start = iteration_start + iam * block_size;
|
start = iteration_start + iam * block_size;
|
||||||
stop = std::min<difference_type>(length, start + block_size);
|
stop = std::min<difference_type>(length, start + block_size);
|
||||||
}
|
}
|
||||||
}
|
} //parallel
|
||||||
|
|
||||||
omp_destroy_lock(&result_lock);
|
omp_destroy_lock(&result_lock);
|
||||||
|
|
||||||
// Return iterator on found element.
|
// Return iterator on found element.
|
||||||
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result);
|
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
|
||||||
|
begin1 + result, begin2 + result);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // end namespace
|
} // end namespace
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -48,8 +48,8 @@
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
|
|
||||||
/** @brief Subsequence description. */
|
/** @brief Subsequence description. */
|
||||||
template<typename _DifferenceTp>
|
template<typename _DifferenceTp>
|
||||||
struct Piece
|
struct Piece
|
||||||
{
|
{
|
||||||
typedef _DifferenceTp difference_type;
|
typedef _DifferenceTp difference_type;
|
||||||
|
|
@ -61,16 +61,19 @@ namespace __gnu_parallel
|
||||||
difference_type end;
|
difference_type end;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Data accessed by all threads.
|
/** @brief Data accessed by all threads.
|
||||||
*
|
*
|
||||||
* PMWMS = parallel multiway mergesort */
|
* PMWMS = parallel multiway mergesort */
|
||||||
template<typename RandomAccessIterator>
|
template<typename RandomAccessIterator>
|
||||||
struct PMWMSSortingData
|
struct PMWMSSortingData
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
|
/** @brief Number of threads involved. */
|
||||||
|
thread_index_t num_threads;
|
||||||
|
|
||||||
/** @brief Input begin. */
|
/** @brief Input begin. */
|
||||||
RandomAccessIterator source;
|
RandomAccessIterator source;
|
||||||
|
|
||||||
|
|
@ -105,62 +108,55 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
/** @brief Pieces of data to merge @c [thread][sequence] */
|
/** @brief Pieces of data to merge @c [thread][sequence] */
|
||||||
std::vector<Piece<difference_type> >* pieces;
|
std::vector<Piece<difference_type> >* pieces;
|
||||||
};
|
|
||||||
|
|
||||||
/** @brief Thread local data for PMWMS. */
|
|
||||||
template<typename RandomAccessIterator>
|
|
||||||
struct PMWMSSorterPU
|
|
||||||
{
|
|
||||||
/** @brief Total number of thread involved. */
|
|
||||||
thread_index_t num_threads;
|
|
||||||
/** @brief Number of owning thread. */
|
|
||||||
thread_index_t iam;
|
|
||||||
/** @brief Stable sorting desired. */
|
/** @brief Stable sorting desired. */
|
||||||
bool stable;
|
bool stable;
|
||||||
/** @brief Pointer to global data. */
|
};
|
||||||
PMWMSSortingData<RandomAccessIterator>* sd;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Select samples from a sequence.
|
* @brief Select samples from a sequence.
|
||||||
* @param d Pointer to thread-local data. Result will be placed in
|
* @param sd Pointer to algorithm data. Result will be placed in
|
||||||
* @c d->ds->samples.
|
* @c sd->samples.
|
||||||
* @param num_samples Number of samples to select.
|
* @param num_samples Number of samples to select.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename _DifferenceTp>
|
template<typename RandomAccessIterator, typename _DifferenceTp>
|
||||||
inline void
|
inline void
|
||||||
determine_samples(PMWMSSorterPU<RandomAccessIterator>* d,
|
determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
|
||||||
_DifferenceTp& num_samples)
|
_DifferenceTp& num_samples)
|
||||||
{
|
{
|
||||||
typedef _DifferenceTp difference_type;
|
typedef _DifferenceTp difference_type;
|
||||||
|
|
||||||
PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
num_samples = Settings::sort_mwms_oversampling * d->num_threads - 1;
|
num_samples =
|
||||||
|
Settings::sort_mwms_oversampling * sd->num_threads - 1;
|
||||||
|
|
||||||
difference_type* es = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_samples + 2)));
|
difference_type* es = new difference_type[num_samples + 2];
|
||||||
|
|
||||||
equally_split(sd->starts[d->iam + 1] - sd->starts[d->iam], num_samples + 1, es);
|
equally_split(sd->starts[iam + 1] - sd->starts[iam],
|
||||||
|
num_samples + 1, es);
|
||||||
|
|
||||||
for (difference_type i = 0; i < num_samples; i++)
|
for (difference_type i = 0; i < num_samples; i++)
|
||||||
sd->samples[d->iam * num_samples + i] = sd->source[sd->starts[d->iam] + es[i + 1]];
|
sd->samples[iam * num_samples + i] =
|
||||||
|
sd->source[sd->starts[iam] + es[i + 1]];
|
||||||
|
|
||||||
|
delete[] es;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief PMWMS code executed by each thread.
|
/** @brief PMWMS code executed by each thread.
|
||||||
* @param d Pointer to thread-local data.
|
* @param sd Pointer to algorithm data.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
parallel_sort_mwms_pu(PMWMSSorterPU<RandomAccessIterator>* d,
|
parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
|
||||||
Comparator& comp)
|
Comparator& comp)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
|
thread_index_t iam = omp_get_thread_num();
|
||||||
thread_index_t iam = d->iam;
|
|
||||||
|
|
||||||
// Length of this thread's chunk, before merging.
|
// Length of this thread's chunk, before merging.
|
||||||
difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
|
difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
|
||||||
|
|
@ -174,161 +170,168 @@ namespace __gnu_parallel
|
||||||
typedef value_type* SortingPlacesIterator;
|
typedef value_type* SortingPlacesIterator;
|
||||||
|
|
||||||
// Sort in temporary storage, leave space for sentinel.
|
// Sort in temporary storage, leave space for sentinel.
|
||||||
sd->sorting_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * (length_local + 1)));
|
sd->sorting_places[iam] = sd->temporaries[iam] =
|
||||||
|
static_cast<value_type*>(
|
||||||
|
::operator new(sizeof(value_type) * (length_local + 1)));
|
||||||
|
|
||||||
// Copy there.
|
// Copy there.
|
||||||
std::uninitialized_copy(sd->source + sd->starts[iam], sd->source + sd->starts[iam] + length_local, sd->sorting_places[iam]);
|
std::uninitialized_copy(sd->source + sd->starts[iam],
|
||||||
|
sd->source + sd->starts[iam] + length_local,
|
||||||
|
sd->sorting_places[iam]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Sort locally.
|
// Sort locally.
|
||||||
if (d->stable)
|
if (sd->stable)
|
||||||
__gnu_sequential::stable_sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
|
__gnu_sequential::stable_sort(sd->sorting_places[iam],
|
||||||
|
sd->sorting_places[iam] + length_local,
|
||||||
|
comp);
|
||||||
else
|
else
|
||||||
__gnu_sequential::sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp);
|
__gnu_sequential::sort(sd->sorting_places[iam],
|
||||||
|
sd->sorting_places[iam] + length_local,
|
||||||
#if _GLIBCXX_ASSERTIONS
|
comp);
|
||||||
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Invariant: locally sorted subsequence in sd->sorting_places[iam],
|
// Invariant: locally sorted subsequence in sd->sorting_places[iam],
|
||||||
// sd->sorting_places[iam] + length_local.
|
// sd->sorting_places[iam] + length_local.
|
||||||
|
|
||||||
if (Settings::sort_splitting == Settings::SAMPLING)
|
if (Settings::sort_splitting == Settings::SAMPLING)
|
||||||
{
|
{
|
||||||
difference_type num_samples;
|
difference_type num_samples;
|
||||||
determine_samples(d, num_samples);
|
determine_samples(sd, num_samples);
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
#pragma omp single
|
# pragma omp single
|
||||||
__gnu_sequential::sort(sd->samples,
|
__gnu_sequential::sort(sd->samples,
|
||||||
sd->samples + (num_samples * d->num_threads),
|
sd->samples + (num_samples * sd->num_threads),
|
||||||
comp);
|
comp);
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
for (int s = 0; s < d->num_threads; s++)
|
for (int s = 0; s < sd->num_threads; s++)
|
||||||
{
|
{
|
||||||
// For each sequence.
|
// For each sequence.
|
||||||
if (num_samples * iam > 0)
|
if (num_samples * iam > 0)
|
||||||
sd->pieces[iam][s].begin = std::lower_bound(sd->sorting_places[s],
|
sd->pieces[iam][s].begin =
|
||||||
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
|
std::lower_bound(sd->sorting_places[s],
|
||||||
sd->samples[num_samples * iam],
|
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
|
||||||
comp)
|
sd->samples[num_samples * iam],
|
||||||
- sd->sorting_places[s];
|
comp)
|
||||||
else
|
- sd->sorting_places[s];
|
||||||
// Absolute beginning.
|
else
|
||||||
sd->pieces[iam][s].begin = 0;
|
// Absolute beginning.
|
||||||
|
sd->pieces[iam][s].begin = 0;
|
||||||
if ((num_samples * (iam + 1)) < (num_samples * d->num_threads))
|
|
||||||
sd->pieces[iam][s].end = std::lower_bound(sd->sorting_places[s],
|
|
||||||
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], sd->samples[num_samples * (iam + 1)], comp)
|
|
||||||
- sd->sorting_places[s];
|
|
||||||
else
|
|
||||||
// Absolute end.
|
|
||||||
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
|
||||||
|
sd->pieces[iam][s].end =
|
||||||
|
std::lower_bound(sd->sorting_places[s],
|
||||||
|
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
|
||||||
|
sd->samples[num_samples * (iam + 1)], comp)
|
||||||
|
- sd->sorting_places[s];
|
||||||
|
else
|
||||||
|
// Absolute end.
|
||||||
|
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (Settings::sort_splitting == Settings::EXACT)
|
else if (Settings::sort_splitting == Settings::EXACT)
|
||||||
{
|
{
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
|
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
|
||||||
for (int s = 0; s < d->num_threads; s++)
|
seqs(sd->num_threads);
|
||||||
seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
|
for (int s = 0; s < sd->num_threads; s++)
|
||||||
|
seqs[s] = std::make_pair(sd->sorting_places[s],
|
||||||
|
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
|
||||||
|
|
||||||
std::vector<SortingPlacesIterator> offsets(d->num_threads);
|
std::vector<SortingPlacesIterator> offsets(sd->num_threads);
|
||||||
|
|
||||||
// If not last thread.
|
// if not last thread
|
||||||
if (iam < d->num_threads - 1)
|
if (iam < sd->num_threads - 1)
|
||||||
multiseq_partition(seqs.begin(), seqs.end(), sd->starts[iam + 1], offsets.begin(), comp);
|
multiseq_partition(seqs.begin(), seqs.end(),
|
||||||
|
sd->starts[iam + 1], offsets.begin(), comp);
|
||||||
|
|
||||||
for (int seq = 0; seq < d->num_threads; seq++)
|
for (int seq = 0; seq < sd->num_threads; seq++)
|
||||||
{
|
{
|
||||||
// For each sequence.
|
// for each sequence
|
||||||
if (iam < (d->num_threads - 1))
|
if (iam < (sd->num_threads - 1))
|
||||||
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
|
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
|
||||||
else
|
else
|
||||||
// Absolute end of this sequence.
|
// very end of this sequence
|
||||||
sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
|
sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
for (int seq = 0; seq < d->num_threads; seq++)
|
for (int seq = 0; seq < sd->num_threads; seq++)
|
||||||
{
|
{
|
||||||
// For each sequence.
|
// For each sequence.
|
||||||
if (iam > 0)
|
if (iam > 0)
|
||||||
sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
|
sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
|
||||||
else
|
else
|
||||||
// Absolute beginning.
|
// Absolute beginning.
|
||||||
sd->pieces[iam][seq].begin = 0;
|
sd->pieces[iam][seq].begin = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Offset from target begin, length after merging.
|
// Offset from target begin, length after merging.
|
||||||
difference_type offset = 0, length_am = 0;
|
difference_type offset = 0, length_am = 0;
|
||||||
for (int s = 0; s < d->num_threads; s++)
|
for (int s = 0; s < sd->num_threads; s++)
|
||||||
{
|
{
|
||||||
length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
|
length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
|
||||||
offset += sd->pieces[iam][s].begin;
|
offset += sd->pieces[iam][s].begin;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
||||||
// Merge to temporary storage, uninitialized creation not possible
|
// Merge to temporary storage, uninitialized creation not possible
|
||||||
// since there is no multiway_merge calling the placement new
|
// since there is no multiway_merge calling the placement new
|
||||||
// instead of the assignment operator.
|
// instead of the assignment operator.
|
||||||
sd->merging_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * length_am));
|
sd->merging_places[iam] = sd->temporaries[iam] =
|
||||||
|
static_cast<value_type*>(
|
||||||
|
::operator new(sizeof(value_type) * length_am));
|
||||||
#else
|
#else
|
||||||
// Merge directly to target.
|
// Merge directly to target.
|
||||||
sd->merging_places[iam] = sd->source + offset;
|
sd->merging_places[iam] = sd->source + offset;
|
||||||
#endif
|
#endif
|
||||||
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
|
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
|
||||||
|
seqs(sd->num_threads);
|
||||||
|
|
||||||
for (int s = 0; s < d->num_threads; s++)
|
for (int s = 0; s < sd->num_threads; s++)
|
||||||
{
|
{
|
||||||
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, sd->sorting_places[s] + sd->pieces[iam][s].end);
|
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
|
||||||
|
sd->sorting_places[s] + sd->pieces[iam][s].end);
|
||||||
#if _GLIBCXX_ASSERTIONS
|
|
||||||
_GLIBCXX_PARALLEL_ASSERT(is_sorted(seqs[s].first, seqs[s].second, comp));
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag());
|
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, sd->stable, false, sequential_tag());
|
||||||
|
|
||||||
#if _GLIBCXX_ASSERTIONS
|
# pragma omp barrier
|
||||||
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
# pragma omp barrier
|
|
||||||
|
|
||||||
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
||||||
// Write back.
|
// Write back.
|
||||||
std::copy(sd->merging_places[iam], sd->merging_places[iam] + length_am,
|
std::copy(sd->merging_places[iam],
|
||||||
sd->source + offset);
|
sd->merging_places[iam] + length_am,
|
||||||
|
sd->source + offset);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
delete[] sd->temporaries[iam];
|
delete[] sd->temporaries[iam];
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief PMWMS main call.
|
/** @brief PMWMS main call.
|
||||||
* @param begin Begin iterator of sequence.
|
* @param begin Begin iterator of sequence.
|
||||||
* @param end End iterator of sequence.
|
* @param end End iterator of sequence.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
* @param n Length of sequence.
|
* @param n Length of sequence.
|
||||||
* @param num_threads Number of threads to use.
|
* @param num_threads Number of threads to use.
|
||||||
* @param stable Stable sorting.
|
* @param stable Stable sorting.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
|
parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
|
||||||
Comparator comp,
|
Comparator comp,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
|
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
|
||||||
int num_threads, bool stable)
|
int num_threads,
|
||||||
|
bool stable)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(n)
|
_GLIBCXX_CALL(n)
|
||||||
|
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
@ -336,75 +339,75 @@ namespace __gnu_parallel
|
||||||
if (n <= 1)
|
if (n <= 1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// At least one element per thread.
|
// at least one element per thread
|
||||||
if (num_threads > n)
|
if (num_threads > n)
|
||||||
num_threads = static_cast<thread_index_t>(n);
|
num_threads = static_cast<thread_index_t>(n);
|
||||||
|
|
||||||
|
// shared variables
|
||||||
PMWMSSortingData<RandomAccessIterator> sd;
|
PMWMSSortingData<RandomAccessIterator> sd;
|
||||||
|
difference_type* starts;
|
||||||
|
|
||||||
sd.source = begin;
|
# pragma omp parallel num_threads(num_threads)
|
||||||
sd.temporaries = new value_type*[num_threads];
|
{
|
||||||
|
num_threads = omp_get_num_threads(); //no more threads than requested
|
||||||
|
|
||||||
|
# pragma omp single
|
||||||
|
{
|
||||||
|
sd.num_threads = num_threads;
|
||||||
|
sd.source = begin;
|
||||||
|
sd.temporaries = new value_type*[num_threads];
|
||||||
|
|
||||||
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
|
||||||
sd.sorting_places = new RandomAccessIterator[num_threads];
|
sd.sorting_places = new RandomAccessIterator[num_threads];
|
||||||
sd.merging_places = new value_type*[num_threads];
|
sd.merging_places = new value_type*[num_threads];
|
||||||
#else
|
#else
|
||||||
sd.sorting_places = new value_type*[num_threads];
|
sd.sorting_places = new value_type*[num_threads];
|
||||||
sd.merging_places = new RandomAccessIterator[num_threads];
|
sd.merging_places = new RandomAccessIterator[num_threads];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (Settings::sort_splitting == Settings::SAMPLING)
|
if (Settings::sort_splitting == Settings::SAMPLING)
|
||||||
{
|
{
|
||||||
unsigned int sz = Settings::sort_mwms_oversampling * num_threads - 1;
|
unsigned int size =
|
||||||
sz *= num_threads;
|
(Settings::sort_mwms_oversampling * num_threads - 1) * num_threads;
|
||||||
|
sd.samples = static_cast<value_type*>(
|
||||||
// Equivalent to value_type[sz], without need of default construction.
|
::operator new(size * sizeof(value_type)));
|
||||||
sz *= sizeof(value_type);
|
}
|
||||||
sd.samples = static_cast<value_type*>(::operator new(sz));
|
else
|
||||||
}
|
sd.samples = NULL;
|
||||||
else
|
|
||||||
sd.samples = NULL;
|
|
||||||
|
|
||||||
sd.offsets = new difference_type[num_threads - 1];
|
sd.offsets = new difference_type[num_threads - 1];
|
||||||
sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
|
sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
|
||||||
for (int s = 0; s < num_threads; s++)
|
for (int s = 0; s < num_threads; s++)
|
||||||
sd.pieces[s].resize(num_threads);
|
sd.pieces[s].resize(num_threads);
|
||||||
PMWMSSorterPU<RandomAccessIterator>* pus = new PMWMSSorterPU<RandomAccessIterator>[num_threads];
|
starts = sd.starts = new difference_type[num_threads + 1];
|
||||||
difference_type* starts = sd.starts = new difference_type[num_threads + 1];
|
sd.stable = stable;
|
||||||
|
|
||||||
difference_type chunk_length = n / num_threads;
|
difference_type chunk_length = n / num_threads;
|
||||||
difference_type split = n % num_threads;
|
difference_type split = n % num_threads;
|
||||||
difference_type start = 0;
|
difference_type pos = 0;
|
||||||
for (int i = 0; i < num_threads; i++)
|
for (int i = 0; i < num_threads; i++)
|
||||||
{
|
{
|
||||||
starts[i] = start;
|
starts[i] = pos;
|
||||||
start += (i < split) ? (chunk_length + 1) : chunk_length;
|
pos += (i < split) ? (chunk_length + 1) : chunk_length;
|
||||||
pus[i].num_threads = num_threads;
|
}
|
||||||
pus[i].iam = i;
|
starts[num_threads] = pos;
|
||||||
pus[i].sd = &sd;
|
}
|
||||||
pus[i].stable = stable;
|
|
||||||
}
|
|
||||||
starts[num_threads] = start;
|
|
||||||
|
|
||||||
// Now sort in parallel.
|
// Now sort in parallel.
|
||||||
#pragma omp parallel num_threads(num_threads)
|
parallel_sort_mwms_pu(&sd, comp);
|
||||||
parallel_sort_mwms_pu(&(pus[omp_get_thread_num()]), comp);
|
} //parallel
|
||||||
|
|
||||||
// XXX sd as RAII
|
|
||||||
delete[] starts;
|
delete[] starts;
|
||||||
delete[] sd.temporaries;
|
delete[] sd.temporaries;
|
||||||
delete[] sd.sorting_places;
|
delete[] sd.sorting_places;
|
||||||
delete[] sd.merging_places;
|
delete[] sd.merging_places;
|
||||||
|
|
||||||
if (Settings::sort_splitting == Settings::SAMPLING)
|
if (Settings::sort_splitting == Settings::SAMPLING)
|
||||||
delete[] sd.samples;
|
delete[] sd.samples;
|
||||||
|
|
||||||
delete[] sd.offsets;
|
delete[] sd.offsets;
|
||||||
delete[] sd.pieces;
|
delete[] sd.pieces;
|
||||||
|
|
||||||
delete[] pus;
|
|
||||||
}
|
}
|
||||||
|
} //namespace __gnu_parallel
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -43,54 +43,71 @@
|
||||||
|
|
||||||
#include <parallel/settings.h>
|
#include <parallel/settings.h>
|
||||||
#include <parallel/basic_iterator.h>
|
#include <parallel/basic_iterator.h>
|
||||||
|
#include <parallel/base.h>
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
/** @brief Embarrassingly parallel algorithm for random access
|
/** @brief Embarrassingly parallel algorithm for random access
|
||||||
* iterators, using an OpenMP for loop.
|
* iterators, using an OpenMP for loop.
|
||||||
*
|
*
|
||||||
* @param begin Begin iterator of element sequence.
|
* @param begin Begin iterator of element sequence.
|
||||||
* @param end End iterator of element sequence.
|
* @param end End iterator of element sequence.
|
||||||
* @param o User-supplied functor (comparator, predicate, adding
|
* @param o User-supplied functor (comparator, predicate, adding
|
||||||
* functor, etc.).
|
* functor, etc.).
|
||||||
* @param f Functor to "process" an element with op (depends on
|
* @param f Functor to "process" an element with op (depends on
|
||||||
* desired functionality, e. g. for std::for_each(), ...).
|
* desired functionality, e. g. for std::for_each(), ...).
|
||||||
* @param r Functor to "add" a single result to the already
|
* @param r Functor to "add" a single result to the already
|
||||||
* processed elements (depends on functionality).
|
* processed elements (depends on functionality).
|
||||||
* @param base Base value for reduction.
|
* @param base Base value for reduction.
|
||||||
* @param output Pointer to position where final result is written to
|
* @param output Pointer to position where final result is written to
|
||||||
* @param bound Maximum number of elements processed (e. g. for
|
* @param bound Maximum number of elements processed (e. g. for
|
||||||
* std::count_n()).
|
* std::count_n()).
|
||||||
* @return User-supplied functor (that may contain a part of the result).
|
* @return User-supplied functor (that may contain a part of the result).
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
|
template<typename RandomAccessIterator,
|
||||||
|
typename Op,
|
||||||
|
typename Fu,
|
||||||
|
typename Red,
|
||||||
|
typename Result>
|
||||||
Op
|
Op
|
||||||
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
|
for_each_template_random_access_omp_loop(
|
||||||
|
RandomAccessIterator begin,
|
||||||
|
RandomAccessIterator end,
|
||||||
|
Op o, Fu& f, Red r, Result base, Result& output,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::
|
||||||
|
difference_type bound)
|
||||||
{
|
{
|
||||||
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
|
typedef typename
|
||||||
|
std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
|
difference_type;
|
||||||
|
|
||||||
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
|
|
||||||
Result *thread_results = new Result[num_threads];
|
|
||||||
difference_type length = end - begin;
|
difference_type length = end - begin;
|
||||||
|
thread_index_t num_threads =
|
||||||
|
__gnu_parallel::min<difference_type>(get_max_threads(), length);
|
||||||
|
|
||||||
|
Result *thread_results;
|
||||||
|
|
||||||
|
# pragma omp parallel num_threads(num_threads)
|
||||||
|
{
|
||||||
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
thread_results = new Result[num_threads];
|
||||||
|
|
||||||
|
for (thread_index_t i = 0; i < num_threads; i++)
|
||||||
|
thread_results[i] = Result();
|
||||||
|
}
|
||||||
|
|
||||||
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
|
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
|
||||||
|
for (difference_type pos = 0; pos < length; pos++)
|
||||||
|
thread_results[iam] =
|
||||||
|
r(thread_results[iam], f(o, begin+pos));
|
||||||
|
} //parallel
|
||||||
|
|
||||||
for (thread_index_t i = 0; i < num_threads; i++)
|
for (thread_index_t i = 0; i < num_threads; i++)
|
||||||
{
|
output = r(output, thread_results[i]);
|
||||||
thread_results[i] = r(thread_results[i], f(o, begin+i));
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
|
||||||
{
|
|
||||||
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
|
|
||||||
for (difference_type pos = 0; pos < length; pos++)
|
|
||||||
{
|
|
||||||
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (thread_index_t i = 0; i < num_threads; i++)
|
|
||||||
{
|
|
||||||
output = r(output, thread_results[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete [] thread_results;
|
delete [] thread_results;
|
||||||
|
|
||||||
|
|
@ -100,6 +117,7 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end namespace
|
} // end namespace
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -64,39 +64,50 @@ namespace __gnu_parallel
|
||||||
* std::count_n()).
|
* std::count_n()).
|
||||||
* @return User-supplied functor (that may contain a part of the result).
|
* @return User-supplied functor (that may contain a part of the result).
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
|
template<typename RandomAccessIterator,
|
||||||
|
typename Op,
|
||||||
|
typename Fu,
|
||||||
|
typename Red,
|
||||||
|
typename Result>
|
||||||
Op
|
Op
|
||||||
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
|
for_each_template_random_access_omp_loop_static(
|
||||||
RandomAccessIterator end,
|
RandomAccessIterator begin,
|
||||||
Op o, Fu& f, Red r,
|
RandomAccessIterator end,
|
||||||
Result base, Result& output,
|
Op o, Fu& f, Red r, Result base, Result& output,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
|
typename std::iterator_traits<RandomAccessIterator>::
|
||||||
|
difference_type bound)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef typename
|
||||||
typedef typename traits_type::difference_type difference_type;
|
std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
|
difference_type;
|
||||||
|
|
||||||
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
|
|
||||||
Result *thread_results = new Result[num_threads];
|
|
||||||
difference_type length = end - begin;
|
difference_type length = end - begin;
|
||||||
|
thread_index_t num_threads =
|
||||||
|
std::min<difference_type>(get_max_threads(), length);
|
||||||
|
|
||||||
|
Result *thread_results;
|
||||||
|
|
||||||
|
# pragma omp parallel num_threads(num_threads)
|
||||||
|
{
|
||||||
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
thread_results = new Result[num_threads];
|
||||||
|
|
||||||
|
for (thread_index_t i = 0; i < num_threads; i++)
|
||||||
|
thread_results[i] = Result();
|
||||||
|
}
|
||||||
|
|
||||||
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
|
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
|
||||||
|
for (difference_type pos = 0; pos < length; pos++)
|
||||||
|
thread_results[iam] =
|
||||||
|
r(thread_results[iam], f(o, begin+pos));
|
||||||
|
} //parallel
|
||||||
|
|
||||||
for (thread_index_t i = 0; i < num_threads; i++)
|
for (thread_index_t i = 0; i < num_threads; i++)
|
||||||
{
|
output = r(output, thread_results[i]);
|
||||||
thread_results[i] = r(thread_results[i], f(o, begin+i));
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
|
||||||
{
|
|
||||||
#pragma omp for schedule(static, Settings::workstealing_chunk_size)
|
|
||||||
for (difference_type pos = 0; pos < length; pos++)
|
|
||||||
{
|
|
||||||
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (thread_index_t i = 0; i < num_threads; i++)
|
|
||||||
{
|
|
||||||
output = r(output, thread_results[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete [] thread_results;
|
delete [] thread_results;
|
||||||
|
|
||||||
|
|
@ -106,6 +117,7 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end namespace
|
} // end namespace
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -41,69 +41,80 @@
|
||||||
|
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#include <parallel/settings.h>
|
#include <parallel/settings.h>
|
||||||
|
#include <parallel/base.h>
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
|
|
||||||
/** @brief Embarrassingly parallel algorithm for random access
|
/** @brief Embarrassingly parallel algorithm for random access
|
||||||
* iterators, using hand-crafted parallelization by equal splitting
|
* iterators, using hand-crafted parallelization by equal splitting
|
||||||
* the work.
|
* the work.
|
||||||
*
|
*
|
||||||
* @param begin Begin iterator of element sequence.
|
* @param begin Begin iterator of element sequence.
|
||||||
* @param end End iterator of element sequence.
|
* @param end End iterator of element sequence.
|
||||||
* @param o User-supplied functor (comparator, predicate, adding
|
* @param o User-supplied functor (comparator, predicate, adding
|
||||||
* functor, ...)
|
* functor, ...)
|
||||||
* @param f Functor to "process" an element with op (depends on
|
* @param f Functor to "process" an element with op (depends on
|
||||||
* desired functionality, e. g. for std::for_each(), ...).
|
* desired functionality, e. g. for std::for_each(), ...).
|
||||||
* @param r Functor to "add" a single result to the already
|
* @param r Functor to "add" a single result to the already
|
||||||
* processed elements (depends on functionality).
|
* processed elements (depends on functionality).
|
||||||
* @param base Base value for reduction.
|
* @param base Base value for reduction.
|
||||||
* @param output Pointer to position where final result is written to
|
* @param output Pointer to position where final result is written to
|
||||||
* @param bound Maximum number of elements processed (e. g. for
|
* @param bound Maximum number of elements processed (e. g. for
|
||||||
* std::count_n()).
|
* std::count_n()).
|
||||||
* @return User-supplied functor (that may contain a part of the result).
|
* @return User-supplied functor (that may contain a part of the result).
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
|
template<
|
||||||
|
typename RandomAccessIterator,
|
||||||
|
typename Op,
|
||||||
|
typename Fu,
|
||||||
|
typename Red,
|
||||||
|
typename Result>
|
||||||
Op
|
Op
|
||||||
for_each_template_random_access_ed(RandomAccessIterator begin,
|
for_each_template_random_access_ed(
|
||||||
RandomAccessIterator end, Op o, Fu& f,
|
RandomAccessIterator begin,
|
||||||
Red r, Result base, Result& output,
|
RandomAccessIterator end,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
|
Op o, Fu& f, Red r, Result base, Result& output,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::
|
||||||
|
difference_type bound)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
const difference_type length = end - begin;
|
const difference_type length = end - begin;
|
||||||
const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
|
Result *thread_results;
|
||||||
const difference_type dmin = settings_threads < length ? settings_threads : length;
|
|
||||||
const difference_type dmax = dmin > 1 ? dmin : 1;
|
|
||||||
|
|
||||||
thread_index_t num_threads = static_cast<thread_index_t>(dmax);
|
thread_index_t num_threads =
|
||||||
|
__gnu_parallel::min<difference_type>(get_max_threads(), length);
|
||||||
|
|
||||||
|
# pragma omp parallel num_threads(num_threads)
|
||||||
|
{
|
||||||
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
thread_results = new Result[num_threads];
|
||||||
|
}
|
||||||
|
|
||||||
Result *thread_results = new Result[num_threads];
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
// Neutral element.
|
||||||
{
|
Result reduct = Result();
|
||||||
// Neutral element.
|
|
||||||
Result reduct = Result();
|
|
||||||
|
|
||||||
thread_index_t p = num_threads;
|
difference_type
|
||||||
thread_index_t iam = omp_get_thread_num();
|
start = equally_split_point(length, num_threads, iam),
|
||||||
difference_type start = iam * length / p;
|
stop = equally_split_point(length, num_threads, iam + 1);
|
||||||
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
|
|
||||||
|
|
||||||
if (start < limit)
|
if (start < stop)
|
||||||
{
|
{
|
||||||
reduct = f(o, begin + start);
|
reduct = f(o, begin + start);
|
||||||
start++;
|
++start;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (; start < limit; start++)
|
for (; start < stop; ++start)
|
||||||
reduct = r(reduct, f(o, begin + start));
|
reduct = r(reduct, f(o, begin + start));
|
||||||
|
|
||||||
thread_results[iam] = reduct;
|
thread_results[iam] = reduct;
|
||||||
}
|
} //parallel
|
||||||
|
|
||||||
for (thread_index_t i = 0; i < num_threads; i++)
|
for (thread_index_t i = 0; i < num_threads; i++)
|
||||||
output = r(output, thread_results[i]);
|
output = r(output, thread_results[i]);
|
||||||
|
|
|
||||||
|
|
@ -48,130 +48,156 @@ namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
// Problem: there is no 0-element given.
|
// Problem: there is no 0-element given.
|
||||||
|
|
||||||
/** @brief Base case prefix sum routine.
|
/** @brief Base case prefix sum routine.
|
||||||
* @param begin Begin iterator of input sequence.
|
* @param begin Begin iterator of input sequence.
|
||||||
* @param end End iterator of input sequence.
|
* @param end End iterator of input sequence.
|
||||||
* @param result Begin iterator of output sequence.
|
* @param result Begin iterator of output sequence.
|
||||||
* @param bin_op Associative binary function.
|
* @param bin_op Associative binary function.
|
||||||
* @param value Start value. Must be passed since the neutral
|
* @param value Start value. Must be passed since the neutral
|
||||||
* element is unknown in general.
|
* element is unknown in general.
|
||||||
* @return End iterator of output sequence. */
|
* @return End iterator of output sequence. */
|
||||||
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename BinaryOperation>
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
|
parallel_partial_sum_basecase(
|
||||||
OutputIterator result, BinaryOperation bin_op,
|
InputIterator begin, InputIterator end,
|
||||||
typename std::iterator_traits<InputIterator>::value_type value)
|
OutputIterator result, BinaryOperation bin_op,
|
||||||
|
typename std::iterator_traits<InputIterator>::value_type value)
|
||||||
{
|
{
|
||||||
if (begin == end)
|
if (begin == end)
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
while (begin != end)
|
while (begin != end)
|
||||||
{
|
{
|
||||||
value = bin_op(value, *begin);
|
value = bin_op(value, *begin);
|
||||||
*result = value;
|
*result = value;
|
||||||
result++;
|
result++;
|
||||||
begin++;
|
begin++;
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Parallel partial sum implementation, two-phase approach,
|
/** @brief Parallel partial sum implementation, two-phase approach,
|
||||||
no recursion.
|
no recursion.
|
||||||
* @param begin Begin iterator of input sequence.
|
* @param begin Begin iterator of input sequence.
|
||||||
* @param end End iterator of input sequence.
|
* @param end End iterator of input sequence.
|
||||||
* @param result Begin iterator of output sequence.
|
* @param result Begin iterator of output sequence.
|
||||||
* @param bin_op Associative binary function.
|
* @param bin_op Associative binary function.
|
||||||
* @param n Length of sequence.
|
* @param n Length of sequence.
|
||||||
* @param num_threads Number of threads to use.
|
* @param num_threads Number of threads to use.
|
||||||
* @return End iterator of output sequence.
|
* @return End iterator of output sequence.
|
||||||
*/
|
*/
|
||||||
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename BinaryOperation>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_partial_sum_linear(InputIterator begin, InputIterator end,
|
parallel_partial_sum_linear(
|
||||||
OutputIterator result, BinaryOperation bin_op,
|
InputIterator begin, InputIterator end,
|
||||||
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads)
|
OutputIterator result, BinaryOperation bin_op,
|
||||||
|
typename std::iterator_traits<InputIterator>::difference_type n)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<InputIterator> traits_type;
|
typedef std::iterator_traits<InputIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
if (num_threads > (n - 1))
|
thread_index_t num_threads =
|
||||||
num_threads = static_cast<thread_index_t>(n - 1);
|
std::min<difference_type>(get_max_threads(), n - 1);
|
||||||
|
|
||||||
if (num_threads < 2)
|
if (num_threads < 2)
|
||||||
{
|
{
|
||||||
*result = *begin;
|
*result = *begin;
|
||||||
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin);
|
return parallel_partial_sum_basecase(
|
||||||
|
begin + 1, end, result + 1, bin_op, *begin);
|
||||||
}
|
}
|
||||||
|
|
||||||
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2)));
|
difference_type* borders;
|
||||||
|
value_type* sums;
|
||||||
|
|
||||||
if (Settings::partial_sum_dilatation == 1.0f)
|
# pragma omp parallel num_threads(num_threads)
|
||||||
equally_split(n, num_threads + 1, borders);
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length;
|
# pragma omp single
|
||||||
borders[0] = 0;
|
{
|
||||||
for (int i = 1; i < (num_threads + 1); i++)
|
num_threads = omp_get_num_threads();
|
||||||
{
|
|
||||||
borders[i] = borderstart;
|
|
||||||
borderstart += chunk_length;
|
|
||||||
}
|
|
||||||
borders[num_threads + 1] = n;
|
|
||||||
}
|
|
||||||
|
|
||||||
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads));
|
borders = new difference_type[num_threads + 2];
|
||||||
OutputIterator target_end;
|
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
if (Settings::partial_sum_dilatation == 1.0f)
|
||||||
{
|
equally_split(n, num_threads + 1, borders);
|
||||||
int id = omp_get_thread_num();
|
else
|
||||||
if (id == 0)
|
{
|
||||||
{
|
difference_type chunk_length =
|
||||||
*result = *begin;
|
((double)n /
|
||||||
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
|
((double)num_threads + Settings::partial_sum_dilatation)),
|
||||||
result + 1, bin_op, *begin);
|
borderstart = n - num_threads * chunk_length;
|
||||||
sums[0] = *(result + borders[1] - 1);
|
borders[0] = 0;
|
||||||
}
|
for (int i = 1; i < (num_threads + 1); i++)
|
||||||
else
|
{
|
||||||
{
|
borders[i] = borderstart;
|
||||||
sums[id] = std::accumulate(begin + borders[id] + 1,
|
borderstart += chunk_length;
|
||||||
begin + borders[id + 1],
|
}
|
||||||
*(begin + borders[id]),
|
borders[num_threads + 1] = n;
|
||||||
bin_op, __gnu_parallel::sequential_tag());
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp barrier
|
sums = static_cast<value_type*>(
|
||||||
|
::operator new(sizeof(value_type) * num_threads));
|
||||||
|
OutputIterator target_end;
|
||||||
|
} //single
|
||||||
|
|
||||||
#pragma omp single
|
int iam = omp_get_thread_num();
|
||||||
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1,
|
if (iam == 0)
|
||||||
bin_op, sums[0]);
|
{
|
||||||
|
*result = *begin;
|
||||||
|
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
|
||||||
|
result + 1, bin_op, *begin);
|
||||||
|
sums[0] = *(result + borders[1] - 1);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sums[iam] = std::accumulate(begin + borders[iam] + 1,
|
||||||
|
begin + borders[iam + 1],
|
||||||
|
*(begin + borders[iam]),
|
||||||
|
bin_op, __gnu_parallel::sequential_tag());
|
||||||
|
}
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
// Still same team.
|
# pragma omp single
|
||||||
parallel_partial_sum_basecase(begin + borders[id + 1],
|
parallel_partial_sum_basecase(
|
||||||
begin + borders[id + 2],
|
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
|
||||||
result + borders[id + 1], bin_op,
|
|
||||||
sums[id]);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete [] sums;
|
# pragma omp barrier
|
||||||
|
|
||||||
|
// Still same team.
|
||||||
|
parallel_partial_sum_basecase(begin + borders[iam + 1],
|
||||||
|
begin + borders[iam + 2],
|
||||||
|
result + borders[iam + 1], bin_op,
|
||||||
|
sums[iam]);
|
||||||
|
} //parallel
|
||||||
|
|
||||||
|
delete[] sums;
|
||||||
|
delete[] borders;
|
||||||
|
|
||||||
return result + n;
|
return result + n;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Parallel partial sum front-end.
|
/** @brief Parallel partial sum front-end.
|
||||||
* @param begin Begin iterator of input sequence.
|
* @param begin Begin iterator of input sequence.
|
||||||
* @param end End iterator of input sequence.
|
* @param end End iterator of input sequence.
|
||||||
* @param result Begin iterator of output sequence.
|
* @param result Begin iterator of output sequence.
|
||||||
* @param bin_op Associative binary function.
|
* @param bin_op Associative binary function.
|
||||||
* @return End iterator of output sequence. */
|
* @return End iterator of output sequence. */
|
||||||
template<typename InputIterator, typename OutputIterator, typename BinaryOperation>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename BinaryOperation>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_partial_sum(InputIterator begin, InputIterator end,
|
parallel_partial_sum(InputIterator begin, InputIterator end,
|
||||||
OutputIterator result, BinaryOperation bin_op)
|
OutputIterator result, BinaryOperation bin_op)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(begin - end);
|
_GLIBCXX_CALL(begin - end)
|
||||||
|
|
||||||
typedef std::iterator_traits<InputIterator> traits_type;
|
typedef std::iterator_traits<InputIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -179,18 +205,15 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
difference_type n = end - begin;
|
difference_type n = end - begin;
|
||||||
|
|
||||||
int num_threads = get_max_threads();
|
|
||||||
|
|
||||||
switch (Settings::partial_sum_algorithm)
|
switch (Settings::partial_sum_algorithm)
|
||||||
{
|
{
|
||||||
case Settings::LINEAR:
|
case Settings::LINEAR:
|
||||||
// Need an initial offset.
|
// Need an initial offset.
|
||||||
return parallel_partial_sum_linear(begin, end, result, bin_op,
|
return parallel_partial_sum_linear(begin, end, result, bin_op, n);
|
||||||
n, num_threads);
|
|
||||||
default:
|
default:
|
||||||
// Partial_sum algorithm not implemented.
|
// Partial_sum algorithm not implemented.
|
||||||
_GLIBCXX_PARALLEL_ASSERT(0);
|
_GLIBCXX_PARALLEL_ASSERT(0);
|
||||||
return result + n;
|
return result + n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -45,21 +45,21 @@
|
||||||
#include <bits/stl_algo.h>
|
#include <bits/stl_algo.h>
|
||||||
#include <parallel/parallel.h>
|
#include <parallel/parallel.h>
|
||||||
|
|
||||||
/** @brief Decide whether to declare certain variable volatile in this file. */
|
/** @brief Decide whether to declare certain variables volatile. */
|
||||||
#define _GLIBCXX_VOLATILE volatile
|
#define _GLIBCXX_VOLATILE volatile
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
/** @brief Parallel implementation of std::partition.
|
/** @brief Parallel implementation of std::partition.
|
||||||
* @param begin Begin iterator of input sequence to split.
|
* @param begin Begin iterator of input sequence to split.
|
||||||
* @param end End iterator of input sequence to split.
|
* @param end End iterator of input sequence to split.
|
||||||
* @param pred Partition predicate, possibly including some kind of pivot.
|
* @param pred Partition predicate, possibly including some kind of pivot.
|
||||||
* @param max_num_threads Maximum number of threads to use for this task.
|
* @param num_threads Maximum number of threads to use for this task.
|
||||||
* @return Number of elements not fulfilling the predicate. */
|
* @return Number of elements not fulfilling the predicate. */
|
||||||
template<typename RandomAccessIterator, typename Predicate>
|
template<typename RandomAccessIterator, typename Predicate>
|
||||||
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
|
typename std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
|
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
|
||||||
Predicate pred, thread_index_t max_num_threads)
|
Predicate pred, thread_index_t num_threads)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -74,212 +74,238 @@ namespace __gnu_parallel
|
||||||
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
|
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
|
||||||
_GLIBCXX_VOLATILE difference_type leftnew, rightnew;
|
_GLIBCXX_VOLATILE difference_type leftnew, rightnew;
|
||||||
|
|
||||||
bool* reserved_left, * reserved_right;
|
bool* reserved_left = NULL, * reserved_right = NULL;
|
||||||
|
|
||||||
reserved_left = new bool[max_num_threads];
|
|
||||||
reserved_right = new bool[max_num_threads];
|
|
||||||
|
|
||||||
difference_type chunk_size;
|
difference_type chunk_size;
|
||||||
if (Settings::partition_chunk_share > 0.0)
|
|
||||||
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
|
|
||||||
else
|
|
||||||
chunk_size = Settings::partition_chunk_size;
|
|
||||||
|
|
||||||
omp_lock_t result_lock;
|
omp_lock_t result_lock;
|
||||||
omp_init_lock(&result_lock);
|
omp_init_lock(&result_lock);
|
||||||
|
|
||||||
// At least good for two processors.
|
//at least two chunks per thread
|
||||||
while (right - left + 1 >= 2 * max_num_threads * chunk_size)
|
if(right - left + 1 >= 2 * num_threads * chunk_size)
|
||||||
|
# pragma omp parallel num_threads(num_threads)
|
||||||
{
|
{
|
||||||
difference_type num_chunks = (right - left + 1) / chunk_size;
|
# pragma omp single
|
||||||
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2);
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
reserved_left = new bool[num_threads];
|
||||||
|
reserved_right = new bool[num_threads];
|
||||||
|
|
||||||
for (int r = 0; r < num_threads; r++)
|
if (Settings::partition_chunk_share > 0.0)
|
||||||
{
|
chunk_size = std::max<difference_type>(
|
||||||
reserved_left[r] = false;
|
Settings::partition_chunk_size,
|
||||||
reserved_right[r] = false;
|
(double)n * Settings::partition_chunk_share /
|
||||||
}
|
(double)num_threads);
|
||||||
leftover_left = 0;
|
else
|
||||||
leftover_right = 0;
|
chunk_size = Settings::partition_chunk_size;
|
||||||
|
}
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
while (right - left + 1 >= 2 * num_threads * chunk_size)
|
||||||
{
|
{
|
||||||
// Private.
|
# pragma omp single
|
||||||
difference_type thread_left, thread_left_border, thread_right, thread_right_border;
|
{
|
||||||
thread_left = left + 1;
|
difference_type num_chunks = (right - left + 1) / chunk_size;
|
||||||
|
|
||||||
// Just to satisfy the condition below.
|
for (int r = 0; r < num_threads; r++)
|
||||||
thread_left_border = thread_left - 1;
|
{
|
||||||
thread_right = n - 1;
|
reserved_left[r] = false;
|
||||||
thread_right_border = thread_right + 1;
|
reserved_right[r] = false;
|
||||||
|
}
|
||||||
|
leftover_left = 0;
|
||||||
|
leftover_right = 0;
|
||||||
|
} //implicit barrier
|
||||||
|
|
||||||
bool iam_finished = false;
|
// Private.
|
||||||
while (!iam_finished)
|
difference_type thread_left, thread_left_border,
|
||||||
{
|
thread_right, thread_right_border;
|
||||||
if (thread_left > thread_left_border)
|
thread_left = left + 1;
|
||||||
{
|
|
||||||
omp_set_lock(&result_lock);
|
|
||||||
if (left + (chunk_size - 1) > right)
|
|
||||||
iam_finished = true;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
thread_left = left;
|
|
||||||
thread_left_border = left + (chunk_size - 1);
|
|
||||||
left += chunk_size;
|
|
||||||
}
|
|
||||||
omp_unset_lock(&result_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (thread_right < thread_right_border)
|
// Just to satisfy the condition below.
|
||||||
{
|
thread_left_border = thread_left - 1;
|
||||||
omp_set_lock(&result_lock);
|
thread_right = n - 1;
|
||||||
if (left > right - (chunk_size - 1))
|
thread_right_border = thread_right + 1;
|
||||||
iam_finished = true;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
thread_right = right;
|
|
||||||
thread_right_border = right - (chunk_size - 1);
|
|
||||||
right -= chunk_size;
|
|
||||||
}
|
|
||||||
omp_unset_lock(&result_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (iam_finished)
|
bool iam_finished = false;
|
||||||
break;
|
while (!iam_finished)
|
||||||
|
{
|
||||||
|
if (thread_left > thread_left_border)
|
||||||
|
{
|
||||||
|
omp_set_lock(&result_lock);
|
||||||
|
if (left + (chunk_size - 1) > right)
|
||||||
|
iam_finished = true;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
thread_left = left;
|
||||||
|
thread_left_border = left + (chunk_size - 1);
|
||||||
|
left += chunk_size;
|
||||||
|
}
|
||||||
|
omp_unset_lock(&result_lock);
|
||||||
|
}
|
||||||
|
|
||||||
// Swap as usual.
|
if (thread_right < thread_right_border)
|
||||||
while (thread_left < thread_right)
|
{
|
||||||
{
|
omp_set_lock(&result_lock);
|
||||||
while (pred(begin[thread_left]) && thread_left <= thread_left_border)
|
if (left > right - (chunk_size - 1))
|
||||||
thread_left++;
|
iam_finished = true;
|
||||||
while (!pred(begin[thread_right]) && thread_right >= thread_right_border)
|
else
|
||||||
thread_right--;
|
{
|
||||||
|
thread_right = right;
|
||||||
|
thread_right_border = right - (chunk_size - 1);
|
||||||
|
right -= chunk_size;
|
||||||
|
}
|
||||||
|
omp_unset_lock(&result_lock);
|
||||||
|
}
|
||||||
|
|
||||||
if (thread_left > thread_left_border || thread_right < thread_right_border)
|
if (iam_finished)
|
||||||
// Fetch new chunk(s).
|
break;
|
||||||
break;
|
|
||||||
|
|
||||||
std::swap(begin[thread_left], begin[thread_right]);
|
// Swap as usual.
|
||||||
thread_left++;
|
while (thread_left < thread_right)
|
||||||
thread_right--;
|
{
|
||||||
}
|
while (pred(begin[thread_left])
|
||||||
}
|
&& thread_left <= thread_left_border)
|
||||||
|
thread_left++;
|
||||||
|
while (!pred(begin[thread_right])
|
||||||
|
&& thread_right >= thread_right_border)
|
||||||
|
thread_right--;
|
||||||
|
|
||||||
// Now swap the leftover chunks to the right places.
|
if (thread_left > thread_left_border
|
||||||
if (thread_left <= thread_left_border)
|
|| thread_right < thread_right_border)
|
||||||
#pragma omp atomic
|
// Fetch new chunk(s).
|
||||||
leftover_left++;
|
break;
|
||||||
if (thread_right >= thread_right_border)
|
|
||||||
#pragma omp atomic
|
|
||||||
leftover_right++;
|
|
||||||
|
|
||||||
#pragma omp barrier
|
std::swap(begin[thread_left], begin[thread_right]);
|
||||||
|
thread_left++;
|
||||||
|
thread_right--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#pragma omp single
|
// Now swap the leftover chunks to the right places.
|
||||||
{
|
if (thread_left <= thread_left_border)
|
||||||
leftnew = left - leftover_left * chunk_size;
|
# pragma omp atomic
|
||||||
rightnew = right + leftover_right * chunk_size;
|
leftover_left++;
|
||||||
}
|
if (thread_right >= thread_right_border)
|
||||||
|
# pragma omp atomic
|
||||||
|
leftover_right++;
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
// <=> thread_left_border + (chunk_size - 1) >= leftnew
|
# pragma omp single
|
||||||
if (thread_left <= thread_left_border
|
{
|
||||||
&& thread_left_border >= leftnew)
|
leftnew = left - leftover_left * chunk_size;
|
||||||
{
|
rightnew = right + leftover_right * chunk_size;
|
||||||
// Chunk already in place, reserve spot.
|
}
|
||||||
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// <=> thread_right_border - (chunk_size - 1) <= rightnew
|
# pragma omp barrier
|
||||||
if (thread_right >= thread_right_border
|
|
||||||
&& thread_right_border <= rightnew)
|
|
||||||
{
|
|
||||||
// Chunk already in place, reserve spot.
|
|
||||||
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp barrier
|
// <=> thread_left_border + (chunk_size - 1) >= leftnew
|
||||||
|
if (thread_left <= thread_left_border
|
||||||
|
&& thread_left_border >= leftnew)
|
||||||
|
{
|
||||||
|
// Chunk already in place, reserve spot.
|
||||||
|
reserved_left[(left - (thread_left_border + 1)) / chunk_size]
|
||||||
|
= true;
|
||||||
|
}
|
||||||
|
|
||||||
if (thread_left <= thread_left_border && thread_left_border < leftnew)
|
// <=> thread_right_border - (chunk_size - 1) <= rightnew
|
||||||
{
|
if (thread_right >= thread_right_border
|
||||||
// Find spot and swap.
|
&& thread_right_border <= rightnew)
|
||||||
difference_type swapstart = -1;
|
{
|
||||||
omp_set_lock(&result_lock);
|
// Chunk already in place, reserve spot.
|
||||||
for (int r = 0; r < leftover_left; r++)
|
reserved_right
|
||||||
|
[((thread_right_border - 1) - right) / chunk_size]
|
||||||
|
= true;
|
||||||
|
}
|
||||||
|
|
||||||
|
# pragma omp barrier
|
||||||
|
|
||||||
|
if (thread_left <= thread_left_border
|
||||||
|
&& thread_left_border < leftnew)
|
||||||
|
{
|
||||||
|
// Find spot and swap.
|
||||||
|
difference_type swapstart = -1;
|
||||||
|
omp_set_lock(&result_lock);
|
||||||
|
for (int r = 0; r < leftover_left; r++)
|
||||||
if (!reserved_left[r])
|
if (!reserved_left[r])
|
||||||
{
|
{
|
||||||
reserved_left[r] = true;
|
reserved_left[r] = true;
|
||||||
swapstart = left - (r + 1) * chunk_size;
|
swapstart = left - (r + 1) * chunk_size;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
omp_unset_lock(&result_lock);
|
omp_unset_lock(&result_lock);
|
||||||
|
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
|
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart);
|
std::swap_ranges(
|
||||||
}
|
begin + thread_left_border - (chunk_size - 1),
|
||||||
|
begin + thread_left_border + 1,
|
||||||
|
begin + swapstart);
|
||||||
|
}
|
||||||
|
|
||||||
if (thread_right >= thread_right_border
|
if (thread_right >= thread_right_border
|
||||||
&& thread_right_border > rightnew)
|
&& thread_right_border > rightnew)
|
||||||
{
|
{
|
||||||
// Find spot and swap
|
// Find spot and swap
|
||||||
difference_type swapstart = -1;
|
difference_type swapstart = -1;
|
||||||
omp_set_lock(&result_lock);
|
omp_set_lock(&result_lock);
|
||||||
for (int r = 0; r < leftover_right; r++)
|
for (int r = 0; r < leftover_right; r++)
|
||||||
if (!reserved_right[r])
|
if (!reserved_right[r])
|
||||||
{
|
{
|
||||||
reserved_right[r] = true;
|
reserved_right[r] = true;
|
||||||
swapstart = right + r * chunk_size + 1;
|
swapstart = right + r * chunk_size + 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
omp_unset_lock(&result_lock);
|
omp_unset_lock(&result_lock);
|
||||||
|
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
|
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart);
|
std::swap_ranges(begin + thread_right_border,
|
||||||
}
|
begin + thread_right_border + chunk_size,
|
||||||
|
begin + swapstart);
|
||||||
|
}
|
||||||
#if _GLIBCXX_ASSERTIONS
|
#if _GLIBCXX_ASSERTIONS
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
#pragma omp single
|
# pragma omp single
|
||||||
{
|
{
|
||||||
for (int r = 0; r < leftover_left; r++)
|
for (int r = 0; r < leftover_left; r++)
|
||||||
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
|
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
|
||||||
for (int r = 0; r < leftover_right; r++)
|
for (int r = 0; r < leftover_right; r++)
|
||||||
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
|
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
left = leftnew;
|
|
||||||
right = rightnew;
|
left = leftnew;
|
||||||
}
|
right = rightnew;
|
||||||
} // end "recursion"
|
}
|
||||||
|
# pragma omp flush(left, right)
|
||||||
|
} // end "recursion" //parallel
|
||||||
|
|
||||||
difference_type final_left = left, final_right = right;
|
difference_type final_left = left, final_right = right;
|
||||||
|
|
||||||
while (final_left < final_right)
|
while (final_left < final_right)
|
||||||
{
|
{
|
||||||
// Go right until key is geq than pivot.
|
// Go right until key is geq than pivot.
|
||||||
while (pred(begin[final_left]) && final_left < final_right)
|
while (pred(begin[final_left]) && final_left < final_right)
|
||||||
final_left++;
|
final_left++;
|
||||||
|
|
||||||
// Go left until key is less than pivot.
|
// Go left until key is less than pivot.
|
||||||
while (!pred(begin[final_right]) && final_left < final_right)
|
while (!pred(begin[final_right]) && final_left < final_right)
|
||||||
final_right--;
|
final_right--;
|
||||||
|
|
||||||
if (final_left == final_right)
|
if (final_left == final_right)
|
||||||
break;
|
break;
|
||||||
std::swap(begin[final_left], begin[final_right]);
|
std::swap(begin[final_left], begin[final_right]);
|
||||||
final_left++;
|
final_left++;
|
||||||
final_right--;
|
final_right--;
|
||||||
}
|
}
|
||||||
|
|
||||||
// All elements on the left side are < piv, all elements on the
|
// All elements on the left side are < piv, all elements on the
|
||||||
|
|
@ -298,14 +324,14 @@ namespace __gnu_parallel
|
||||||
return final_left + 1;
|
return final_left + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Parallel implementation of std::nth_element().
|
* @brief Parallel implementation of std::nth_element().
|
||||||
* @param begin Begin iterator of input sequence.
|
* @param begin Begin iterator of input sequence.
|
||||||
* @param nth Iterator of element that must be in position afterwards.
|
* @param nth Iterator of element that must be in position afterwards.
|
||||||
* @param end End iterator of input sequence.
|
* @param end End iterator of input sequence.
|
||||||
* @param comp Comparator.
|
* @param comp Comparator.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
void
|
void
|
||||||
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
|
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
|
||||||
RandomAccessIterator end, Comparator comp)
|
RandomAccessIterator end, Comparator comp)
|
||||||
|
|
@ -324,65 +350,65 @@ namespace __gnu_parallel
|
||||||
// Break if input range to small.
|
// Break if input range to small.
|
||||||
while (static_cast<sequence_index_t>(end - begin) >= minimum_length)
|
while (static_cast<sequence_index_t>(end - begin) >= minimum_length)
|
||||||
{
|
{
|
||||||
difference_type n = end - begin;
|
difference_type n = end - begin;
|
||||||
|
|
||||||
RandomAccessIterator pivot_pos = begin + rng(n);
|
RandomAccessIterator pivot_pos = begin + rng(n);
|
||||||
|
|
||||||
// Swap pivot_pos value to end.
|
// Swap pivot_pos value to end.
|
||||||
if (pivot_pos != (end - 1))
|
if (pivot_pos != (end - 1))
|
||||||
std::swap(*pivot_pos, *(end - 1));
|
std::swap(*pivot_pos, *(end - 1));
|
||||||
pivot_pos = end - 1;
|
pivot_pos = end - 1;
|
||||||
|
|
||||||
// XXX Comparator must have first_value_type, second_value_type, result_type
|
// XXX Comparator must have first_value_type, second_value_type, result_type
|
||||||
// Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> >
|
// Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> >
|
||||||
// pivot_pos == std::pair<S, int>*
|
// pivot_pos == std::pair<S, int>*
|
||||||
// XXX binder2nd only for RandomAccessIterators??
|
// XXX binder2nd only for RandomAccessIterators??
|
||||||
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
|
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
|
||||||
|
|
||||||
// Divide, leave pivot unchanged in last place.
|
// Divide, leave pivot unchanged in last place.
|
||||||
RandomAccessIterator split_pos1, split_pos2;
|
RandomAccessIterator split_pos1, split_pos2;
|
||||||
split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads());
|
split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads());
|
||||||
|
|
||||||
// Left side: < pivot_pos; right side: >= pivot_pos
|
// Left side: < pivot_pos; right side: >= pivot_pos
|
||||||
|
|
||||||
// Swap pivot back to middle.
|
// Swap pivot back to middle.
|
||||||
if (split_pos1 != pivot_pos)
|
if (split_pos1 != pivot_pos)
|
||||||
std::swap(*split_pos1, *pivot_pos);
|
std::swap(*split_pos1, *pivot_pos);
|
||||||
pivot_pos = split_pos1;
|
pivot_pos = split_pos1;
|
||||||
|
|
||||||
// In case all elements are equal, split_pos1 == 0
|
// In case all elements are equal, split_pos1 == 0
|
||||||
if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7))
|
if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7))
|
||||||
{
|
{
|
||||||
// Very unequal split, one part smaller than one 128th
|
// Very unequal split, one part smaller than one 128th
|
||||||
// elements not stricly larger than the pivot.
|
// elements not stricly larger than the pivot.
|
||||||
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
|
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
|
||||||
|
|
||||||
// Find other end of pivot-equal range.
|
// Find other end of pivot-equal range.
|
||||||
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
|
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
// Only skip the pivot.
|
// Only skip the pivot.
|
||||||
split_pos2 = split_pos1 + 1;
|
split_pos2 = split_pos1 + 1;
|
||||||
|
|
||||||
// Compare iterators.
|
// Compare iterators.
|
||||||
if (split_pos2 <= nth)
|
if (split_pos2 <= nth)
|
||||||
begin = split_pos2;
|
begin = split_pos2;
|
||||||
else if (nth < split_pos1)
|
else if (nth < split_pos1)
|
||||||
end = split_pos1;
|
end = split_pos1;
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only at most Settings::partition_minimal_n elements left.
|
// Only at most Settings::partition_minimal_n elements left.
|
||||||
__gnu_sequential::sort(begin, end, comp);
|
__gnu_sequential::sort(begin, end, comp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Parallel implementation of std::partial_sort().
|
/** @brief Parallel implementation of std::partial_sort().
|
||||||
* @param begin Begin iterator of input sequence.
|
* @param begin Begin iterator of input sequence.
|
||||||
* @param middle Sort until this position.
|
* @param middle Sort until this position.
|
||||||
* @param end End iterator of input sequence.
|
* @param end End iterator of input sequence.
|
||||||
* @param comp Comparator. */
|
* @param comp Comparator. */
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
void
|
void
|
||||||
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
|
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
|
||||||
{
|
{
|
||||||
|
|
@ -390,7 +416,7 @@ namespace __gnu_parallel
|
||||||
std::sort(begin, middle, comp);
|
std::sort(begin, middle, comp);
|
||||||
}
|
}
|
||||||
|
|
||||||
} //namespace __gnu_parallel
|
} //namespace __gnu_parallel
|
||||||
|
|
||||||
#undef _GLIBCXX_VOLATILE
|
#undef _GLIBCXX_VOLATILE
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -53,11 +53,17 @@ namespace __gnu_parallel
|
||||||
* this part.
|
* this part.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline typename std::iterator_traits<RandomAccessIterator>::difference_type
|
inline
|
||||||
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
|
typename std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
Comparator comp,
|
parallel_sort_qs_divide(
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
|
RandomAccessIterator begin,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
|
RandomAccessIterator end,
|
||||||
|
Comparator comp,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
|
pivot_rank,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
|
num_samples,
|
||||||
|
thread_index_t num_threads)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -65,20 +71,24 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
difference_type n = end - begin;
|
difference_type n = end - begin;
|
||||||
num_samples = std::min(num_samples, n);
|
num_samples = std::min(num_samples, n);
|
||||||
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
|
|
||||||
|
// Allocate uninitialized, to avoid default constructor.
|
||||||
|
value_type* samples = static_cast<value_type*>(
|
||||||
|
operator new(num_samples * sizeof(value_type)));
|
||||||
|
|
||||||
for (difference_type s = 0; s < num_samples; s++)
|
for (difference_type s = 0; s < num_samples; s++)
|
||||||
{
|
{
|
||||||
const unsigned long long index = static_cast<unsigned long long>(s)
|
const unsigned long long index = static_cast<unsigned long long>(s)
|
||||||
* n / num_samples;
|
* n / num_samples;
|
||||||
samples[s] = begin[index];
|
new(samples + s) value_type(begin[index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
__gnu_sequential::sort(samples, samples + num_samples, comp);
|
__gnu_sequential::sort(samples, samples + num_samples, comp);
|
||||||
|
|
||||||
value_type& pivot = samples[pivot_rank * num_samples / n];
|
value_type& pivot = samples[pivot_rank * num_samples / n];
|
||||||
|
|
||||||
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
|
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
|
||||||
|
pred(comp, pivot);
|
||||||
difference_type split = parallel_partition(begin, end, pred, num_threads);
|
difference_type split = parallel_partition(begin, end, pred, num_threads);
|
||||||
|
|
||||||
return split;
|
return split;
|
||||||
|
|
@ -93,7 +103,10 @@ namespace __gnu_parallel
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
|
parallel_sort_qs_conquer(RandomAccessIterator begin,
|
||||||
|
RandomAccessIterator end,
|
||||||
|
Comparator comp,
|
||||||
|
thread_index_t num_threads)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -101,8 +114,8 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
if (num_threads <= 1)
|
if (num_threads <= 1)
|
||||||
{
|
{
|
||||||
__gnu_sequential::sort(begin, end, comp);
|
__gnu_sequential::sort(begin, end, comp);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
difference_type n = end - begin, pivot_rank;
|
difference_type n = end - begin, pivot_rank;
|
||||||
|
|
@ -110,24 +123,27 @@ namespace __gnu_parallel
|
||||||
if (n <= 1)
|
if (n <= 1)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
thread_index_t num_processors_left;
|
thread_index_t num_threads_left;
|
||||||
|
|
||||||
if ((num_threads % 2) == 1)
|
if ((num_threads % 2) == 1)
|
||||||
num_processors_left = num_threads / 2 + 1;
|
num_threads_left = num_threads / 2 + 1;
|
||||||
else
|
else
|
||||||
num_processors_left = num_threads / 2;
|
num_threads_left = num_threads / 2;
|
||||||
|
|
||||||
pivot_rank = n * num_processors_left / num_threads;
|
pivot_rank = n * num_threads_left / num_threads;
|
||||||
|
|
||||||
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
|
difference_type split = parallel_sort_qs_divide(
|
||||||
Settings::sort_qs_num_samples_preset, num_threads);
|
begin, end, comp, pivot_rank,
|
||||||
|
Settings::sort_qs_num_samples_preset, num_threads);
|
||||||
|
|
||||||
#pragma omp parallel sections
|
#pragma omp parallel sections
|
||||||
{
|
{
|
||||||
#pragma omp section
|
#pragma omp section
|
||||||
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
|
parallel_sort_qs_conquer(begin, begin + split,
|
||||||
|
comp, num_threads_left);
|
||||||
#pragma omp section
|
#pragma omp section
|
||||||
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
|
parallel_sort_qs_conquer(begin + split, end,
|
||||||
|
comp, num_threads - num_threads_left);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Comparator>
|
template<typename RandomAccessIterator, typename Comparator>
|
||||||
inline void
|
inline void
|
||||||
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
|
parallel_sort_qs(
|
||||||
Comparator comp,
|
RandomAccessIterator begin,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
|
RandomAccessIterator end,
|
||||||
|
Comparator comp,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
|
||||||
|
int num_threads)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(n)
|
_GLIBCXX_CALL(n)
|
||||||
|
|
||||||
|
|
@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads);
|
||||||
// Hard to avoid.
|
// Hard to avoid.
|
||||||
omp_set_num_threads(num_threads);
|
omp_set_num_threads(num_threads);
|
||||||
|
|
||||||
bool old_nested = (omp_get_nested() != 0);
|
|
||||||
omp_set_nested(true);
|
|
||||||
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
|
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
|
||||||
omp_set_nested(old_nested);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} //namespace __gnu_parallel
|
} //namespace __gnu_parallel
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -45,16 +45,16 @@
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
/** @brief Type to hold the index of a bin.
|
/** @brief Type to hold the index of a bin.
|
||||||
*
|
*
|
||||||
* Since many variables of this type are allocated, it should be
|
* Since many variables of this type are allocated, it should be
|
||||||
* chosen as small as possible.
|
* chosen as small as possible.
|
||||||
*/
|
*/
|
||||||
typedef unsigned short bin_index;
|
typedef unsigned short bin_index;
|
||||||
|
|
||||||
/** @brief Data known to every thread participating in
|
/** @brief Data known to every thread participating in
|
||||||
__gnu_parallel::parallel_random_shuffle(). */
|
__gnu_parallel::parallel_random_shuffle(). */
|
||||||
template<typename RandomAccessIterator>
|
template<typename RandomAccessIterator>
|
||||||
struct DRandomShufflingGlobalData
|
struct DRandomShufflingGlobalData
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
|
|
@ -90,18 +90,15 @@ namespace __gnu_parallel
|
||||||
: source(_source) { }
|
: source(_source) { }
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Local data for a thread participating in
|
/** @brief Local data for a thread participating in
|
||||||
__gnu_parallel::parallel_random_shuffle().
|
__gnu_parallel::parallel_random_shuffle().
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
||||||
struct DRSSorterPU
|
struct DRSSorterPU
|
||||||
{
|
{
|
||||||
/** @brief Number of threads participating in total. */
|
/** @brief Number of threads participating in total. */
|
||||||
int num_threads;
|
int num_threads;
|
||||||
|
|
||||||
/** @brief Number of owning thread. */
|
|
||||||
int iam;
|
|
||||||
|
|
||||||
/** @brief Begin index for bins taken care of by this thread. */
|
/** @brief Begin index for bins taken care of by this thread. */
|
||||||
bin_index bins_begin;
|
bin_index bins_begin;
|
||||||
|
|
||||||
|
|
@ -115,29 +112,29 @@ namespace __gnu_parallel
|
||||||
DRandomShufflingGlobalData<RandomAccessIterator>* sd;
|
DRandomShufflingGlobalData<RandomAccessIterator>* sd;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Generate a random number in @c [0,2^logp).
|
/** @brief Generate a random number in @c [0,2^logp).
|
||||||
* @param logp Logarithm (basis 2) of the upper range bound.
|
* @param logp Logarithm (basis 2) of the upper range bound.
|
||||||
* @param rng Random number generator to use.
|
* @param rng Random number generator to use.
|
||||||
*/
|
*/
|
||||||
template<typename RandomNumberGenerator>
|
template<typename RandomNumberGenerator>
|
||||||
inline int
|
inline int
|
||||||
random_number_pow2(int logp, RandomNumberGenerator& rng)
|
random_number_pow2(int logp, RandomNumberGenerator& rng)
|
||||||
{ return rng.genrand_bits(logp); }
|
{ return rng.genrand_bits(logp); }
|
||||||
|
|
||||||
/** @brief Random shuffle code executed by each thread.
|
/** @brief Random shuffle code executed by each thread.
|
||||||
* @param pus Array of thread-local data records. */
|
* @param pus Array of thread-local data records. */
|
||||||
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
||||||
inline void
|
inline void
|
||||||
parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
|
parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
|
||||||
RandomNumberGenerator>* pus)
|
RandomNumberGenerator>* pus)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
|
||||||
DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
|
DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
|
||||||
thread_index_t iam = d->iam;
|
|
||||||
|
|
||||||
// Indexing: dist[bin][processor]
|
// Indexing: dist[bin][processor]
|
||||||
difference_type length = sd->starts[iam + 1] - sd->starts[iam];
|
difference_type length = sd->starts[iam + 1] - sd->starts[iam];
|
||||||
|
|
@ -156,35 +153,35 @@ namespace __gnu_parallel
|
||||||
// First main loop.
|
// First main loop.
|
||||||
for (difference_type i = 0; i < length; i++)
|
for (difference_type i = 0; i < length; i++)
|
||||||
{
|
{
|
||||||
bin_index oracle = random_number_pow2(num_bits, rng);
|
bin_index oracle = random_number_pow2(num_bits, rng);
|
||||||
oracles[i] = oracle;
|
oracles[i] = oracle;
|
||||||
|
|
||||||
// To allow prefix (partial) sum.
|
// To allow prefix (partial) sum.
|
||||||
dist[oracle + 1]++;
|
dist[oracle + 1]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (bin_index b = 0; b < sd->num_bins + 1; b++)
|
for (bin_index b = 0; b < sd->num_bins + 1; b++)
|
||||||
sd->dist[b][iam + 1] = dist[b];
|
sd->dist[b][iam + 1] = dist[b];
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
#pragma omp single
|
# pragma omp single
|
||||||
{
|
{
|
||||||
// Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
|
// Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
|
||||||
// total number of items in bin s
|
// total number of items in bin s
|
||||||
for (bin_index s = 0; s < sd->num_bins; s++)
|
for (bin_index s = 0; s < sd->num_bins; s++)
|
||||||
__gnu_sequential::partial_sum(sd->dist[s + 1],
|
__gnu_sequential::partial_sum(sd->dist[s + 1],
|
||||||
sd->dist[s + 1] + d->num_threads + 1,
|
sd->dist[s + 1] + d->num_threads + 1,
|
||||||
sd->dist[s + 1]);
|
sd->dist[s + 1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
sequence_index_t offset = 0, global_offset = 0;
|
sequence_index_t offset = 0, global_offset = 0;
|
||||||
for (bin_index s = 0; s < d->bins_begin; s++)
|
for (bin_index s = 0; s < d->bins_begin; s++)
|
||||||
global_offset += sd->dist[s + 1][d->num_threads];
|
global_offset += sd->dist[s + 1][d->num_threads];
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
for (bin_index s = d->bins_begin; s < d->bins_end; s++)
|
for (bin_index s = d->bins_begin; s < d->bins_end; s++)
|
||||||
{
|
{
|
||||||
|
|
@ -193,9 +190,10 @@ namespace __gnu_parallel
|
||||||
offset = sd->dist[s + 1][d->num_threads];
|
offset = sd->dist[s + 1][d->num_threads];
|
||||||
}
|
}
|
||||||
|
|
||||||
sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
|
sd->temporaries[iam] = static_cast<value_type*>(
|
||||||
|
::operator new(sizeof(value_type) * offset));
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
// Draw local copies to avoid false sharing.
|
// Draw local copies to avoid false sharing.
|
||||||
for (bin_index b = 0; b < sd->num_bins + 1; b++)
|
for (bin_index b = 0; b < sd->num_bins + 1; b++)
|
||||||
|
|
@ -211,11 +209,11 @@ namespace __gnu_parallel
|
||||||
// Distribute according to oracles, second main loop.
|
// Distribute according to oracles, second main loop.
|
||||||
for (difference_type i = 0; i < length; i++)
|
for (difference_type i = 0; i < length; i++)
|
||||||
{
|
{
|
||||||
bin_index target_bin = oracles[i];
|
bin_index target_bin = oracles[i];
|
||||||
thread_index_t target_p = bin_proc[target_bin];
|
thread_index_t target_p = bin_proc[target_bin];
|
||||||
|
|
||||||
// Last column [d->num_threads] stays unchanged.
|
// Last column [d->num_threads] stays unchanged.
|
||||||
temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
|
temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
|
||||||
}
|
}
|
||||||
|
|
||||||
delete[] oracles;
|
delete[] oracles;
|
||||||
|
|
@ -223,23 +221,27 @@ namespace __gnu_parallel
|
||||||
delete[] bin_proc;
|
delete[] bin_proc;
|
||||||
delete[] temporaries;
|
delete[] temporaries;
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
// Shuffle bins internally.
|
// Shuffle bins internally.
|
||||||
for (bin_index b = d->bins_begin; b < d->bins_end; b++)
|
for (bin_index b = d->bins_begin; b < d->bins_end; b++)
|
||||||
{
|
{
|
||||||
value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
|
value_type* begin =
|
||||||
* end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
|
sd->temporaries[iam] +
|
||||||
sequential_random_shuffle(begin, end, rng);
|
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
|
||||||
std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
|
* end =
|
||||||
|
sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
|
||||||
|
sequential_random_shuffle(begin, end, rng);
|
||||||
|
std::copy(begin, end, sd->source + global_offset +
|
||||||
|
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
|
||||||
}
|
}
|
||||||
|
|
||||||
delete[] sd->temporaries[iam];
|
delete[] sd->temporaries[iam];
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Round up to the next greater power of 2.
|
/** @brief Round up to the next greater power of 2.
|
||||||
* @param x Integer to round up */
|
* @param x Integer to round up */
|
||||||
template<typename T>
|
template<typename T>
|
||||||
T
|
T
|
||||||
round_up_to_pow2(T x)
|
round_up_to_pow2(T x)
|
||||||
{
|
{
|
||||||
|
|
@ -249,16 +251,21 @@ namespace __gnu_parallel
|
||||||
return (T)1 << (log2(x - 1) + 1);
|
return (T)1 << (log2(x - 1) + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Main parallel random shuffle step.
|
/** @brief Main parallel random shuffle step.
|
||||||
* @param begin Begin iterator of sequence.
|
* @param begin Begin iterator of sequence.
|
||||||
* @param end End iterator of sequence.
|
* @param end End iterator of sequence.
|
||||||
* @param n Length of sequence.
|
* @param n Length of sequence.
|
||||||
* @param num_threads Number of threads to use.
|
* @param num_threads Number of threads to use.
|
||||||
* @param rng Random number generator to use.
|
* @param rng Random number generator to use.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
||||||
inline void
|
inline void
|
||||||
parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
|
parallel_random_shuffle_drs(
|
||||||
|
RandomAccessIterator begin,
|
||||||
|
RandomAccessIterator end,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
|
||||||
|
thread_index_t num_threads,
|
||||||
|
RandomNumberGenerator& rng)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -275,87 +282,99 @@ namespace __gnu_parallel
|
||||||
// Try the L1 cache first.
|
// Try the L1 cache first.
|
||||||
|
|
||||||
// Must fit into L1.
|
// Must fit into L1.
|
||||||
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
|
num_bins_cache = std::max<difference_type>(
|
||||||
|
1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
|
||||||
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
||||||
|
|
||||||
// No more buckets than TLB entries, power of 2
|
// No more buckets than TLB entries, power of 2
|
||||||
// Power of 2 and at least one element per bin, at most the TLB size.
|
// Power of 2 and at least one element per bin, at most the TLB size.
|
||||||
num_bins = std::min(n, (difference_type)num_bins_cache);
|
num_bins = std::min<difference_type>(n, num_bins_cache);
|
||||||
|
|
||||||
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
||||||
// 2 TLB entries needed per bin.
|
// 2 TLB entries needed per bin.
|
||||||
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
|
num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
|
||||||
#endif
|
#endif
|
||||||
num_bins = round_up_to_pow2(num_bins);
|
num_bins = round_up_to_pow2(num_bins);
|
||||||
|
|
||||||
if (num_bins < num_bins_cache)
|
if (num_bins < num_bins_cache)
|
||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
// Now try the L2 cache
|
// Now try the L2 cache
|
||||||
// Must fit into L2
|
// Must fit into L2
|
||||||
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
|
num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
|
||||||
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
1, n / (Settings::L2_cache_size / sizeof(value_type))));
|
||||||
|
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
||||||
|
|
||||||
// No more buckets than TLB entries, power of 2.
|
// No more buckets than TLB entries, power of 2.
|
||||||
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
|
num_bins = static_cast<bin_index>(
|
||||||
// Power of 2 and at least one element per bin, at most the TLB size.
|
std::min(n, static_cast<difference_type>(num_bins_cache)));
|
||||||
|
// Power of 2 and at least one element per bin, at most the TLB size.
|
||||||
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
||||||
// 2 TLB entries needed per bin.
|
// 2 TLB entries needed per bin.
|
||||||
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
|
num_bins = std::min(
|
||||||
|
static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
|
||||||
#endif
|
#endif
|
||||||
num_bins = round_up_to_pow2(num_bins);
|
num_bins = round_up_to_pow2(num_bins);
|
||||||
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
|
num_threads = std::min<bin_index>(num_threads, num_bins);
|
||||||
|
|
||||||
if (num_threads <= 1)
|
if (num_threads <= 1)
|
||||||
return sequential_random_shuffle(begin, end, rng);
|
return sequential_random_shuffle(begin, end, rng);
|
||||||
|
|
||||||
DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
|
DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
|
||||||
|
DRSSorterPU<RandomAccessIterator, random_number >* pus;
|
||||||
|
difference_type* starts;
|
||||||
|
|
||||||
DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
|
# pragma omp parallel num_threads(num_threads)
|
||||||
|
|
||||||
sd.temporaries = new value_type*[num_threads];
|
|
||||||
//sd.oracles = new bin_index[n];
|
|
||||||
sd.dist = new difference_type*[num_bins + 1];
|
|
||||||
sd.bin_proc = new thread_index_t[num_bins];
|
|
||||||
for (bin_index b = 0; b < num_bins + 1; b++)
|
|
||||||
sd.dist[b] = new difference_type[num_threads + 1];
|
|
||||||
for (bin_index b = 0; b < (num_bins + 1); b++)
|
|
||||||
{
|
{
|
||||||
sd.dist[0][0] = 0;
|
# pragma omp single
|
||||||
sd.dist[b][0] = 0;
|
{
|
||||||
}
|
pus = new DRSSorterPU<RandomAccessIterator, random_number>
|
||||||
difference_type* starts = sd.starts = new difference_type[num_threads + 1];
|
[num_threads];
|
||||||
int bin_cursor = 0;
|
|
||||||
sd.num_bins = num_bins;
|
|
||||||
sd.num_bits = log2(num_bins);
|
|
||||||
|
|
||||||
difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
|
sd.temporaries = new value_type*[num_threads];
|
||||||
int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
|
sd.dist = new difference_type*[num_bins + 1];
|
||||||
for (int i = 0; i < num_threads; i++)
|
sd.bin_proc = new thread_index_t[num_bins];
|
||||||
{
|
for (bin_index b = 0; b < num_bins + 1; b++)
|
||||||
starts[i] = start;
|
sd.dist[b] = new difference_type[num_threads + 1];
|
||||||
start += (i < split) ? (chunk_length + 1) : chunk_length;
|
for (bin_index b = 0; b < (num_bins + 1); b++)
|
||||||
int j = pus[i].bins_begin = bin_cursor;
|
{
|
||||||
|
sd.dist[0][0] = 0;
|
||||||
|
sd.dist[b][0] = 0;
|
||||||
|
}
|
||||||
|
starts = sd.starts = new difference_type[num_threads + 1];
|
||||||
|
int bin_cursor = 0;
|
||||||
|
sd.num_bins = num_bins;
|
||||||
|
sd.num_bits = log2(num_bins);
|
||||||
|
|
||||||
// Range of bins for this processor.
|
difference_type chunk_length = n / num_threads,
|
||||||
bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
|
split = n % num_threads, start = 0;
|
||||||
pus[i].bins_end = bin_cursor;
|
difference_type bin_chunk_length = num_bins / num_threads,
|
||||||
for (; j < bin_cursor; j++)
|
bin_split = num_bins % num_threads;
|
||||||
sd.bin_proc[j] = i;
|
for (thread_index_t i = 0; i < num_threads; i++)
|
||||||
pus[i].num_threads = num_threads;
|
{
|
||||||
pus[i].iam = i;
|
starts[i] = start;
|
||||||
pus[i].seed = rng(std::numeric_limits<uint32>::max());
|
start += (i < split) ? (chunk_length + 1) : chunk_length;
|
||||||
pus[i].sd = &sd;
|
int j = pus[i].bins_begin = bin_cursor;
|
||||||
}
|
|
||||||
starts[num_threads] = start;
|
|
||||||
|
|
||||||
// Now shuffle in parallel.
|
// Range of bins for this processor.
|
||||||
#pragma omp parallel num_threads(num_threads)
|
bin_cursor += (i < bin_split) ?
|
||||||
parallel_random_shuffle_drs_pu(pus);
|
(bin_chunk_length + 1) : bin_chunk_length;
|
||||||
|
pus[i].bins_end = bin_cursor;
|
||||||
|
for (; j < bin_cursor; j++)
|
||||||
|
sd.bin_proc[j] = i;
|
||||||
|
pus[i].num_threads = num_threads;
|
||||||
|
pus[i].seed = rng(std::numeric_limits<uint32>::max());
|
||||||
|
pus[i].sd = &sd;
|
||||||
|
}
|
||||||
|
starts[num_threads] = start;
|
||||||
|
} //single
|
||||||
|
// Now shuffle in parallel.
|
||||||
|
parallel_random_shuffle_drs_pu(pus);
|
||||||
|
}
|
||||||
|
|
||||||
delete[] starts;
|
delete[] starts;
|
||||||
delete[] sd.bin_proc;
|
delete[] sd.bin_proc;
|
||||||
|
|
@ -367,16 +386,16 @@ namespace __gnu_parallel
|
||||||
delete[] pus;
|
delete[] pus;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Sequential cache-efficient random shuffle.
|
/** @brief Sequential cache-efficient random shuffle.
|
||||||
* @param begin Begin iterator of sequence.
|
* @param begin Begin iterator of sequence.
|
||||||
* @param end End iterator of sequence.
|
* @param end End iterator of sequence.
|
||||||
* @param rng Random number generator to use.
|
* @param rng Random number generator to use.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
||||||
inline void
|
inline void
|
||||||
sequential_random_shuffle(RandomAccessIterator begin,
|
sequential_random_shuffle(RandomAccessIterator begin,
|
||||||
RandomAccessIterator end,
|
RandomAccessIterator end,
|
||||||
RandomNumberGenerator& rng)
|
RandomNumberGenerator& rng)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
@ -388,7 +407,9 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
||||||
// Try the L1 cache first, must fit into L1.
|
// Try the L1 cache first, must fit into L1.
|
||||||
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
|
num_bins_cache =
|
||||||
|
std::max<difference_type>
|
||||||
|
(1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
|
||||||
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
||||||
|
|
||||||
// No more buckets than TLB entries, power of 2
|
// No more buckets than TLB entries, power of 2
|
||||||
|
|
@ -403,19 +424,23 @@ namespace __gnu_parallel
|
||||||
if (num_bins < num_bins_cache)
|
if (num_bins < num_bins_cache)
|
||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
// Now try the L2 cache, must fit into L2.
|
// Now try the L2 cache, must fit into L2.
|
||||||
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
|
num_bins_cache =
|
||||||
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
static_cast<bin_index>(std::max<difference_type>(
|
||||||
|
1, n / (Settings::L2_cache_size / sizeof(value_type))));
|
||||||
|
num_bins_cache = round_up_to_pow2(num_bins_cache);
|
||||||
|
|
||||||
// No more buckets than TLB entries, power of 2
|
// No more buckets than TLB entries, power of 2
|
||||||
// Power of 2 and at least one element per bin, at most the TLB size.
|
// Power of 2 and at least one element per bin, at most the TLB size.
|
||||||
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
|
num_bins = static_cast<bin_index>
|
||||||
|
(std::min(n, static_cast<difference_type>(num_bins_cache)));
|
||||||
|
|
||||||
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
|
||||||
// 2 TLB entries needed per bin
|
// 2 TLB entries needed per bin
|
||||||
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
|
num_bins =
|
||||||
|
std::min<difference_type>(Settings::TLB_size / 2, num_bins);
|
||||||
#endif
|
#endif
|
||||||
num_bins = round_up_to_pow2(num_bins);
|
num_bins = round_up_to_pow2(num_bins);
|
||||||
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -424,58 +449,62 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
if (num_bins > 1)
|
if (num_bins > 1)
|
||||||
{
|
{
|
||||||
value_type* target = static_cast<value_type*>(::operator new(sizeof(value_type) * n));
|
value_type* target = static_cast<value_type*>(
|
||||||
bin_index* oracles = new bin_index[n];
|
::operator new(sizeof(value_type) * n));
|
||||||
difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1];
|
bin_index* oracles = new bin_index[n];
|
||||||
|
difference_type* dist0 = new difference_type[num_bins + 1],
|
||||||
|
* dist1 = new difference_type[num_bins + 1];
|
||||||
|
|
||||||
for (int b = 0; b < num_bins + 1; b++)
|
for (int b = 0; b < num_bins + 1; b++)
|
||||||
dist0[b] = 0;
|
dist0[b] = 0;
|
||||||
|
|
||||||
random_number bitrng(rng(0xFFFFFFFF));
|
random_number bitrng(rng(0xFFFFFFFF));
|
||||||
|
|
||||||
for (difference_type i = 0; i < n; i++)
|
for (difference_type i = 0; i < n; i++)
|
||||||
{
|
{
|
||||||
bin_index oracle = random_number_pow2(num_bits, bitrng);
|
bin_index oracle = random_number_pow2(num_bits, bitrng);
|
||||||
oracles[i] = oracle;
|
oracles[i] = oracle;
|
||||||
|
|
||||||
// To allow prefix (partial) sum.
|
// To allow prefix (partial) sum.
|
||||||
dist0[oracle + 1]++;
|
dist0[oracle + 1]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sum up bins.
|
// Sum up bins.
|
||||||
__gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
|
__gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
|
||||||
|
|
||||||
for (int b = 0; b < num_bins + 1; b++)
|
for (int b = 0; b < num_bins + 1; b++)
|
||||||
dist1[b] = dist0[b];
|
dist1[b] = dist0[b];
|
||||||
|
|
||||||
// Distribute according to oracles.
|
// Distribute according to oracles.
|
||||||
for (difference_type i = 0; i < n; i++)
|
for (difference_type i = 0; i < n; i++)
|
||||||
target[(dist0[oracles[i]])++] = *(begin + i);
|
target[(dist0[oracles[i]])++] = *(begin + i);
|
||||||
|
|
||||||
for (int b = 0; b < num_bins; b++)
|
for (int b = 0; b < num_bins; b++)
|
||||||
{
|
{
|
||||||
sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
|
sequential_random_shuffle(target + dist1[b],
|
||||||
rng);
|
target + dist1[b + 1],
|
||||||
}
|
rng);
|
||||||
|
}
|
||||||
|
|
||||||
delete[] dist0;
|
delete[] dist0;
|
||||||
delete[] dist1;
|
delete[] dist1;
|
||||||
delete[] oracles;
|
delete[] oracles;
|
||||||
delete[] target;
|
delete[] target;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
__gnu_sequential::random_shuffle(begin, end, rng);
|
__gnu_sequential::random_shuffle(begin, end, rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Parallel random public call.
|
/** @brief Parallel random public call.
|
||||||
* @param begin Begin iterator of sequence.
|
* @param begin Begin iterator of sequence.
|
||||||
* @param end End iterator of sequence.
|
* @param end End iterator of sequence.
|
||||||
* @param rng Random number generator to use.
|
* @param rng Random number generator to use.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
template<typename RandomAccessIterator, typename RandomNumberGenerator>
|
||||||
inline void
|
inline void
|
||||||
parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end,
|
parallel_random_shuffle(RandomAccessIterator begin,
|
||||||
RandomNumberGenerator rng = random_number())
|
RandomAccessIterator end,
|
||||||
|
RandomNumberGenerator rng = random_number())
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
typedef std::iterator_traits<RandomAccessIterator> traits_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
|
||||||
|
|
@ -53,10 +53,10 @@ namespace __gnu_parallel
|
||||||
* @param length Length of sequence to search for.
|
* @param length Length of sequence to search for.
|
||||||
* @param advances Returned offsets.
|
* @param advances Returned offsets.
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename _DifferenceTp>
|
template<typename RandomAccessIterator, typename _DifferenceTp>
|
||||||
void
|
void
|
||||||
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
|
calc_borders(RandomAccessIterator elements, _DifferenceTp length,
|
||||||
_DifferenceTp* off)
|
_DifferenceTp* off)
|
||||||
{
|
{
|
||||||
typedef _DifferenceTp difference_type;
|
typedef _DifferenceTp difference_type;
|
||||||
|
|
||||||
|
|
@ -66,9 +66,9 @@ namespace __gnu_parallel
|
||||||
difference_type k = 0;
|
difference_type k = 0;
|
||||||
for (difference_type j = 2; j <= length; j++)
|
for (difference_type j = 2; j <= length; j++)
|
||||||
{
|
{
|
||||||
while ((k >= 0) && !(elements[k] == elements[j-1]))
|
while ((k >= 0) && !(elements[k] == elements[j-1]))
|
||||||
k = off[k];
|
k = off[k];
|
||||||
off[j] = ++k;
|
off[j] = ++k;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -81,11 +81,14 @@ namespace __gnu_parallel
|
||||||
* @param end2 End iterator of second sequence.
|
* @param end2 End iterator of second sequence.
|
||||||
* @param pred Find predicate.
|
* @param pred Find predicate.
|
||||||
* @return Place of finding in first sequences. */
|
* @return Place of finding in first sequences. */
|
||||||
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred>
|
template<
|
||||||
|
typename _RandomAccessIterator1,
|
||||||
|
typename _RandomAccessIterator2,
|
||||||
|
typename Pred>
|
||||||
_RandomAccessIterator1
|
_RandomAccessIterator1
|
||||||
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
|
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
|
||||||
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
|
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
|
||||||
Pred pred)
|
Pred pred)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<_RandomAccessIterator1> traits_type;
|
typedef std::iterator_traits<_RandomAccessIterator1> traits_type;
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
@ -103,60 +106,71 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
// Where is first occurrence of pattern? defaults to end.
|
// Where is first occurrence of pattern? defaults to end.
|
||||||
difference_type result = (end1 - begin1);
|
difference_type result = (end1 - begin1);
|
||||||
|
difference_type *splitters;
|
||||||
|
|
||||||
// Pattern too long.
|
// Pattern too long.
|
||||||
if (input_length < 0)
|
if (input_length < 0)
|
||||||
return end1;
|
return end1;
|
||||||
|
|
||||||
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
|
|
||||||
|
|
||||||
omp_lock_t result_lock;
|
omp_lock_t result_lock;
|
||||||
omp_init_lock(&result_lock);
|
omp_init_lock(&result_lock);
|
||||||
|
|
||||||
difference_type borders[num_threads + 1];
|
thread_index_t num_threads =
|
||||||
__gnu_parallel::equally_split(input_length, num_threads, borders);
|
std::max<difference_type>(1,
|
||||||
|
std::min<difference_type>(input_length, get_max_threads()));
|
||||||
|
|
||||||
difference_type advances[pattern_length];
|
difference_type advances[pattern_length];
|
||||||
calc_borders(begin2, pattern_length, advances);
|
calc_borders(begin2, pattern_length, advances);
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
# pragma omp parallel num_threads(num_threads)
|
||||||
{
|
{
|
||||||
thread_index_t iam = omp_get_thread_num();
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
splitters = new difference_type[num_threads + 1];
|
||||||
|
equally_split(input_length, num_threads, splitters);
|
||||||
|
}
|
||||||
|
|
||||||
difference_type start = borders[iam], stop = borders[iam + 1];
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
difference_type pos_in_pattern = 0;
|
difference_type start = splitters[iam], stop = splitters[iam + 1];
|
||||||
bool found_pattern = false;
|
|
||||||
|
|
||||||
while (start <= stop && !found_pattern)
|
difference_type pos_in_pattern = 0;
|
||||||
{
|
bool found_pattern = false;
|
||||||
// Get new value of result.
|
|
||||||
#pragma omp flush(result)
|
|
||||||
// No chance for this thread to find first occurrence.
|
|
||||||
if (result < start)
|
|
||||||
break;
|
|
||||||
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern]))
|
|
||||||
{
|
|
||||||
++pos_in_pattern;
|
|
||||||
if (pos_in_pattern == pattern_length)
|
|
||||||
{
|
|
||||||
// Found new candidate for result.
|
|
||||||
omp_set_lock(&result_lock);
|
|
||||||
result = std::min(result, start);
|
|
||||||
omp_unset_lock(&result_lock);
|
|
||||||
|
|
||||||
found_pattern = true;
|
while (start <= stop && !found_pattern)
|
||||||
break;
|
{
|
||||||
}
|
// Get new value of result.
|
||||||
}
|
#pragma omp flush(result)
|
||||||
// Make safe jump.
|
// No chance for this thread to find first occurrence.
|
||||||
start += (pos_in_pattern - advances[pos_in_pattern]);
|
if (result < start)
|
||||||
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
|
break;
|
||||||
}
|
while (pred(begin1[start + pos_in_pattern],
|
||||||
}
|
begin2[pos_in_pattern]))
|
||||||
|
{
|
||||||
|
++pos_in_pattern;
|
||||||
|
if (pos_in_pattern == pattern_length)
|
||||||
|
{
|
||||||
|
// Found new candidate for result.
|
||||||
|
omp_set_lock(&result_lock);
|
||||||
|
result = std::min(result, start);
|
||||||
|
omp_unset_lock(&result_lock);
|
||||||
|
|
||||||
|
found_pattern = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Make safe jump.
|
||||||
|
start += (pos_in_pattern - advances[pos_in_pattern]);
|
||||||
|
pos_in_pattern =
|
||||||
|
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
|
||||||
|
}
|
||||||
|
} //parallel
|
||||||
|
|
||||||
omp_destroy_lock(&result_lock);
|
omp_destroy_lock(&result_lock);
|
||||||
|
|
||||||
|
delete[] splitters;
|
||||||
|
|
||||||
// Return iterator on found element.
|
// Return iterator on found element.
|
||||||
return (begin1 + result);
|
return (begin1 + result);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -47,28 +47,31 @@
|
||||||
|
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
template<typename InputIterator, typename OutputIterator>
|
template<typename InputIterator, typename OutputIterator>
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
copy_tail(std::pair<InputIterator, InputIterator> b,
|
copy_tail(std::pair<InputIterator, InputIterator> b,
|
||||||
std::pair<InputIterator, InputIterator> e, OutputIterator r)
|
std::pair<InputIterator, InputIterator> e, OutputIterator r)
|
||||||
{
|
{
|
||||||
if (b.first != e.first)
|
if (b.first != e.first)
|
||||||
{
|
{
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
*r++ = *b.first++;
|
*r++ = *b.first++;
|
||||||
}
|
}
|
||||||
while (b.first != e.first);
|
while (b.first != e.first);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
while (b.second != e.second)
|
while (b.second != e.second)
|
||||||
*r++ = *b.second++;
|
*r++ = *b.second++;
|
||||||
}
|
}
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
struct symmetric_difference_func
|
struct symmetric_difference_func
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<InputIterator> traits_type;
|
typedef std::iterator_traits<InputIterator> traits_type;
|
||||||
|
|
@ -80,55 +83,56 @@ namespace __gnu_parallel
|
||||||
Comparator comp;
|
Comparator comp;
|
||||||
|
|
||||||
inline OutputIterator invoke(InputIterator a, InputIterator b,
|
inline OutputIterator invoke(InputIterator a, InputIterator b,
|
||||||
InputIterator c, InputIterator d,
|
InputIterator c, InputIterator d,
|
||||||
OutputIterator r) const
|
OutputIterator r) const
|
||||||
{
|
{
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{
|
{
|
||||||
*r = *a;
|
*r = *a;
|
||||||
++a;
|
++a;
|
||||||
++r;
|
++r;
|
||||||
}
|
}
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{
|
{
|
||||||
*r = *c;
|
*r = *c;
|
||||||
++c;
|
++c;
|
||||||
++r;
|
++r;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return std::copy(c, d, std::copy(a, b, r));
|
return std::copy(c, d, std::copy(a, b, r));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline difference_type
|
inline difference_type
|
||||||
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
|
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
|
||||||
|
const
|
||||||
{
|
{
|
||||||
difference_type counter = 0;
|
difference_type counter = 0;
|
||||||
|
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++counter;
|
++counter;
|
||||||
}
|
}
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{
|
{
|
||||||
++c;
|
++c;
|
||||||
++counter;
|
++counter;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return counter + (b - a) + (d - c);
|
return counter + (b - a) + (d - c);
|
||||||
}
|
}
|
||||||
|
|
@ -144,7 +148,10 @@ namespace __gnu_parallel
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
struct difference_func
|
struct difference_func
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<InputIterator> traits_type;
|
typedef std::iterator_traits<InputIterator> traits_type;
|
||||||
|
|
@ -157,44 +164,45 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
|
invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
|
||||||
OutputIterator r) const
|
OutputIterator r) const
|
||||||
{
|
{
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{
|
{
|
||||||
*r = *a;
|
*r = *a;
|
||||||
++a;
|
++a;
|
||||||
++r;
|
++r;
|
||||||
}
|
}
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{ ++c; }
|
{ ++c; }
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return std::copy(a, b, r);
|
return std::copy(a, b, r);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline difference_type
|
inline difference_type
|
||||||
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
|
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
|
||||||
|
const
|
||||||
{
|
{
|
||||||
difference_type counter = 0;
|
difference_type counter = 0;
|
||||||
|
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++counter;
|
++counter;
|
||||||
}
|
}
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{ ++c; }
|
{ ++c; }
|
||||||
else
|
else
|
||||||
{ ++a; ++c; }
|
{ ++a; ++c; }
|
||||||
}
|
}
|
||||||
|
|
||||||
return counter + (b - a);
|
return counter + (b - a);
|
||||||
}
|
}
|
||||||
|
|
@ -209,7 +217,10 @@ namespace __gnu_parallel
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
struct intersection_func
|
struct intersection_func
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<InputIterator> traits_type;
|
typedef std::iterator_traits<InputIterator> traits_type;
|
||||||
|
|
@ -222,44 +233,45 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
|
invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
|
||||||
OutputIterator r) const
|
OutputIterator r) const
|
||||||
{
|
{
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{ ++a; }
|
{ ++a; }
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{ ++c; }
|
{ ++c; }
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
*r = *a;
|
*r = *a;
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
++r;
|
++r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline difference_type
|
inline difference_type
|
||||||
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const
|
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
|
||||||
|
const
|
||||||
{
|
{
|
||||||
difference_type counter = 0;
|
difference_type counter = 0;
|
||||||
|
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{ ++a; }
|
{ ++a; }
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{ ++c; }
|
{ ++c; }
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
++counter;
|
++counter;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
@ -273,10 +285,11 @@ namespace __gnu_parallel
|
||||||
{ return out; }
|
{ return out; }
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class InputIterator, class OutputIterator, class Comparator>
|
template<class InputIterator, class OutputIterator, class Comparator>
|
||||||
struct union_func
|
struct union_func
|
||||||
{
|
{
|
||||||
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
|
typedef typename std::iterator_traits<InputIterator>::difference_type
|
||||||
|
difference_type;
|
||||||
|
|
||||||
union_func(Comparator c) : comp(c) {}
|
union_func(Comparator c) : comp(c) {}
|
||||||
|
|
||||||
|
|
@ -284,50 +297,50 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
invoke(InputIterator a, const InputIterator b, InputIterator c,
|
invoke(InputIterator a, const InputIterator b, InputIterator c,
|
||||||
const InputIterator d, OutputIterator r) const
|
const InputIterator d, OutputIterator r) const
|
||||||
{
|
{
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{
|
{
|
||||||
*r = *a;
|
*r = *a;
|
||||||
++a;
|
++a;
|
||||||
}
|
}
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{
|
{
|
||||||
*r = *c;
|
*r = *c;
|
||||||
++c;
|
++c;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
*r = *a;
|
*r = *a;
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
}
|
}
|
||||||
++r;
|
++r;
|
||||||
}
|
}
|
||||||
return std::copy(c, d, std::copy(a, b, r));
|
return std::copy(c, d, std::copy(a, b, r));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline difference_type
|
inline difference_type
|
||||||
count(InputIterator a, const InputIterator b, InputIterator c,
|
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
|
||||||
const InputIterator d) const
|
const
|
||||||
{
|
{
|
||||||
difference_type counter = 0;
|
difference_type counter = 0;
|
||||||
|
|
||||||
while (a != b && c != d)
|
while (a != b && c != d)
|
||||||
{
|
{
|
||||||
if (comp(*a, *c))
|
if (comp(*a, *c))
|
||||||
{ ++a; }
|
{ ++a; }
|
||||||
else if (comp(*c, *a))
|
else if (comp(*c, *a))
|
||||||
{ ++c; }
|
{ ++c; }
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++a;
|
++a;
|
||||||
++c;
|
++c;
|
||||||
}
|
}
|
||||||
++counter;
|
++counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
counter += (b - a);
|
counter += (b - a);
|
||||||
counter += (d - c);
|
counter += (d - c);
|
||||||
|
|
@ -343,11 +356,14 @@ namespace __gnu_parallel
|
||||||
{ return std::copy(a, b, out); }
|
{ return std::copy(a, b, out); }
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Operation>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Operation>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_set_operation(InputIterator begin1, InputIterator end1,
|
parallel_set_operation(InputIterator begin1, InputIterator end1,
|
||||||
InputIterator begin2, InputIterator end2,
|
InputIterator begin2, InputIterator end2,
|
||||||
OutputIterator result, Operation op)
|
OutputIterator result, Operation op)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL((end1 - begin1) + (end2 - begin2))
|
_GLIBCXX_CALL((end1 - begin1) + (end2 - begin2))
|
||||||
|
|
||||||
|
|
@ -355,7 +371,6 @@ namespace __gnu_parallel
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
typedef typename std::pair<InputIterator, InputIterator> iterator_pair;
|
typedef typename std::pair<InputIterator, InputIterator> iterator_pair;
|
||||||
|
|
||||||
|
|
||||||
if (begin1 == end1)
|
if (begin1 == end1)
|
||||||
return op.first_empty(begin2, end2, result);
|
return op.first_empty(begin2, end2, result);
|
||||||
|
|
||||||
|
|
@ -364,152 +379,174 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
const difference_type size = (end1 - begin1) + (end2 - begin2);
|
const difference_type size = (end1 - begin1) + (end2 - begin2);
|
||||||
|
|
||||||
thread_index_t num_threads = std::min<difference_type>(std::min(end1 - begin1, end2 - begin2), get_max_threads());
|
const iterator_pair sequence[ 2 ] =
|
||||||
|
{ std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
|
||||||
difference_type borders[num_threads + 2];
|
|
||||||
equally_split(size, num_threads + 1, borders);
|
|
||||||
|
|
||||||
const iterator_pair sequence[ 2 ] = { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
|
|
||||||
|
|
||||||
iterator_pair block_begins[num_threads + 1];
|
|
||||||
|
|
||||||
// Very start.
|
|
||||||
block_begins[0] = std::make_pair(begin1, begin2);
|
|
||||||
difference_type length[num_threads];
|
|
||||||
|
|
||||||
OutputIterator return_value = result;
|
OutputIterator return_value = result;
|
||||||
|
difference_type *borders;
|
||||||
|
iterator_pair *block_begins;
|
||||||
|
difference_type* lengths;
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_threads)
|
thread_index_t num_threads =
|
||||||
{
|
std::min<difference_type>(get_max_threads(),
|
||||||
// Result from multiseq_partition.
|
std::min(end1 - begin1, end2 - begin2));
|
||||||
InputIterator offset[2];
|
|
||||||
const int iam = omp_get_thread_num();
|
|
||||||
|
|
||||||
const difference_type rank = borders[iam + 1];
|
# pragma omp parallel num_threads(num_threads)
|
||||||
|
{
|
||||||
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
|
||||||
multiseq_partition(sequence, sequence + 2, rank, offset, op.comp);
|
borders = new difference_type[num_threads + 2];
|
||||||
|
equally_split(size, num_threads + 1, borders);
|
||||||
|
block_begins = new iterator_pair[num_threads + 1];
|
||||||
|
// Very start.
|
||||||
|
block_begins[0] = std::make_pair(begin1, begin2);
|
||||||
|
lengths = new difference_type[num_threads];
|
||||||
|
} //single
|
||||||
|
|
||||||
// allowed to read?
|
thread_index_t iam = omp_get_thread_num();
|
||||||
// together
|
|
||||||
// *(offset[ 0 ] - 1) == *offset[ 1 ]
|
|
||||||
if (offset[ 0 ] != begin1 && offset[ 1 ] != end2
|
|
||||||
&& !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ])
|
|
||||||
&& !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1)))
|
|
||||||
{
|
|
||||||
// Avoid split between globally equal elements: move one to
|
|
||||||
// front in first sequence.
|
|
||||||
--offset[ 0 ];
|
|
||||||
}
|
|
||||||
|
|
||||||
iterator_pair block_end = block_begins[ iam + 1 ] = iterator_pair(offset[ 0 ], offset[ 1 ]);
|
// Result from multiseq_partition.
|
||||||
|
InputIterator offset[2];
|
||||||
|
const difference_type rank = borders[iam + 1];
|
||||||
|
|
||||||
// Make sure all threads have their block_begin result written out.
|
multiseq_partition(sequence, sequence + 2, rank, offset, op.comp);
|
||||||
#pragma omp barrier
|
|
||||||
|
|
||||||
iterator_pair block_begin = block_begins[ iam ];
|
// allowed to read?
|
||||||
|
// together
|
||||||
|
// *(offset[ 0 ] - 1) == *offset[ 1 ]
|
||||||
|
if (offset[ 0 ] != begin1 && offset[ 1 ] != end2
|
||||||
|
&& !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ])
|
||||||
|
&& !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1)))
|
||||||
|
{
|
||||||
|
// Avoid split between globally equal elements: move one to
|
||||||
|
// front in first sequence.
|
||||||
|
--offset[ 0 ];
|
||||||
|
}
|
||||||
|
|
||||||
// Begin working for the first block, while the others except
|
iterator_pair block_end = block_begins[ iam + 1 ] =
|
||||||
// the last start to count.
|
iterator_pair(offset[ 0 ], offset[ 1 ]);
|
||||||
if (iam == 0)
|
|
||||||
{
|
|
||||||
// The first thread can copy already.
|
|
||||||
length[ iam ] = op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, result) - result;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
length[ iam ] = op.count(block_begin.first, block_end.first,
|
|
||||||
block_begin.second, block_end.second);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure everyone wrote their lengths.
|
// Make sure all threads have their block_begin result written out.
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
OutputIterator r = result;
|
iterator_pair block_begin = block_begins[ iam ];
|
||||||
|
|
||||||
if (iam == 0)
|
// Begin working for the first block, while the others except
|
||||||
{
|
// the last start to count.
|
||||||
// Do the last block.
|
if (iam == 0)
|
||||||
for (int i = 0; i < num_threads; ++i)
|
{
|
||||||
r += length[i];
|
// The first thread can copy already.
|
||||||
|
lengths[ iam ] = op.invoke(block_begin.first, block_end.first,
|
||||||
|
block_begin.second, block_end.second,
|
||||||
|
result)
|
||||||
|
- result;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
lengths[ iam ] = op.count(block_begin.first, block_end.first,
|
||||||
|
block_begin.second, block_end.second);
|
||||||
|
}
|
||||||
|
|
||||||
block_begin = block_begins[num_threads];
|
// Make sure everyone wrote their lengths.
|
||||||
|
# pragma omp barrier
|
||||||
|
|
||||||
// Return the result iterator of the last block.
|
OutputIterator r = result;
|
||||||
return_value = op.invoke(block_begin.first, end1, block_begin.second, end2, r);
|
|
||||||
|
|
||||||
}
|
if (iam == 0)
|
||||||
else
|
{
|
||||||
{
|
// Do the last block.
|
||||||
for (int i = 0; i < iam; ++i)
|
for (int i = 0; i < num_threads; ++i)
|
||||||
r += length[ i ];
|
r += lengths[i];
|
||||||
|
|
||||||
// Reset begins for copy pass.
|
block_begin = block_begins[num_threads];
|
||||||
op.invoke(block_begin.first, block_end.first,
|
|
||||||
block_begin.second, block_end.second, r);
|
// Return the result iterator of the last block.
|
||||||
}
|
return_value = op.invoke(
|
||||||
}
|
block_begin.first, end1, block_begin.second, end2, r);
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int i = 0; i < iam; ++i)
|
||||||
|
r += lengths[ i ];
|
||||||
|
|
||||||
|
// Reset begins for copy pass.
|
||||||
|
op.invoke(block_begin.first, block_end.first,
|
||||||
|
block_begin.second, block_end.second, r);
|
||||||
|
}
|
||||||
|
}
|
||||||
return return_value;
|
return return_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_set_union(InputIterator begin1, InputIterator end1,
|
parallel_set_union(InputIterator begin1, InputIterator end1,
|
||||||
InputIterator begin2, InputIterator end2,
|
InputIterator begin2, InputIterator end2,
|
||||||
OutputIterator result, Comparator comp)
|
OutputIterator result, Comparator comp)
|
||||||
{
|
{
|
||||||
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
||||||
union_func< InputIterator, OutputIterator, Comparator>(comp));
|
union_func< InputIterator, OutputIterator, Comparator>(comp));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_set_intersection(InputIterator begin1, InputIterator end1,
|
parallel_set_intersection(InputIterator begin1, InputIterator end1,
|
||||||
InputIterator begin2, InputIterator end2,
|
InputIterator begin2, InputIterator end2,
|
||||||
OutputIterator result, Comparator comp)
|
OutputIterator result, Comparator comp)
|
||||||
{
|
{
|
||||||
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
||||||
intersection_func<InputIterator, OutputIterator, Comparator>(comp));
|
intersection_func<InputIterator, OutputIterator, Comparator>(comp));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator>
|
template<typename InputIterator, typename OutputIterator>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result)
|
set_intersection(InputIterator begin1, InputIterator end1,
|
||||||
|
InputIterator begin2, InputIterator end2,
|
||||||
|
OutputIterator result)
|
||||||
{
|
{
|
||||||
typedef std::iterator_traits<InputIterator> traits_type;
|
typedef std::iterator_traits<InputIterator> traits_type;
|
||||||
typedef typename traits_type::value_type value_type;
|
typedef typename traits_type::value_type value_type;
|
||||||
|
|
||||||
return set_intersection(begin1, end1, begin2, end2, result,
|
return set_intersection(begin1, end1, begin2, end2, result,
|
||||||
std::less<value_type>());
|
std::less<value_type>());
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_set_difference(InputIterator begin1, InputIterator end1,
|
parallel_set_difference(InputIterator begin1, InputIterator end1,
|
||||||
InputIterator begin2, InputIterator end2,
|
InputIterator begin2, InputIterator end2,
|
||||||
OutputIterator result, Comparator comp)
|
OutputIterator result, Comparator comp)
|
||||||
{
|
{
|
||||||
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
||||||
difference_func<InputIterator, OutputIterator, Comparator>(comp));
|
difference_func<InputIterator, OutputIterator, Comparator>(comp));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename InputIterator, typename OutputIterator, typename Comparator>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
typename OutputIterator,
|
||||||
|
typename Comparator>
|
||||||
OutputIterator
|
OutputIterator
|
||||||
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp)
|
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1,
|
||||||
|
InputIterator begin2, InputIterator end2,
|
||||||
|
OutputIterator result, Comparator comp)
|
||||||
{
|
{
|
||||||
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
return parallel_set_operation(begin1, end1, begin2, end2, result,
|
||||||
symmetric_difference_func<InputIterator, OutputIterator, Comparator>(comp));
|
symmetric_difference_func<InputIterator, OutputIterator, Comparator>
|
||||||
|
(comp));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // _GLIBCXX_SET_ALGORITHM_
|
#endif // _GLIBCXX_SET_ALGORITHM_
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,16 +44,19 @@
|
||||||
namespace __gnu_parallel
|
namespace __gnu_parallel
|
||||||
{
|
{
|
||||||
|
|
||||||
/** @brief Parallel std::unique_copy(), without explicit equality predicate.
|
/** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
|
||||||
* @param first Begin iterator of input sequence.
|
* @param first Begin iterator of input sequence.
|
||||||
* @param last End iterator of input sequence.
|
* @param last End iterator of input sequence.
|
||||||
* @param result Begin iterator of result sequence.
|
* @param result Begin iterator of result sequence.
|
||||||
* @param binary_pred Equality predicate.
|
* @param binary_pred Equality predicate.
|
||||||
* @return End iterator of result sequence. */
|
* @return End iterator of result sequence. */
|
||||||
template<typename InputIterator, class OutputIterator, class BinaryPredicate>
|
template<
|
||||||
|
typename InputIterator,
|
||||||
|
class OutputIterator,
|
||||||
|
class BinaryPredicate>
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
parallel_unique_copy(InputIterator first, InputIterator last,
|
parallel_unique_copy(InputIterator first, InputIterator last,
|
||||||
OutputIterator result, BinaryPredicate binary_pred)
|
OutputIterator result, BinaryPredicate binary_pred)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(last - first)
|
_GLIBCXX_CALL(last - first)
|
||||||
|
|
||||||
|
|
@ -62,126 +65,136 @@ namespace __gnu_parallel
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
difference_type size = last - first;
|
difference_type size = last - first;
|
||||||
int num_threads = __gnu_parallel::get_max_threads();
|
|
||||||
difference_type counter[num_threads + 1];
|
|
||||||
|
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
// Let the first thread process two parts.
|
// Let the first thread process two parts.
|
||||||
difference_type borders[num_threads + 2];
|
difference_type *counter;
|
||||||
__gnu_parallel::equally_split(size, num_threads + 1, borders);
|
difference_type *borders;
|
||||||
|
|
||||||
|
thread_index_t num_threads = get_max_threads();
|
||||||
// First part contains at least one element.
|
// First part contains at least one element.
|
||||||
#pragma omp parallel num_threads(num_threads)
|
# pragma omp parallel num_threads(num_threads)
|
||||||
{
|
{
|
||||||
int iam = omp_get_thread_num();
|
# pragma omp single
|
||||||
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
borders = new difference_type[num_threads + 2];
|
||||||
|
equally_split(size, num_threads + 1, borders);
|
||||||
|
counter = new difference_type[num_threads + 1];
|
||||||
|
}
|
||||||
|
|
||||||
difference_type begin, end;
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
// Check for length without duplicates
|
difference_type begin, end;
|
||||||
// Needed for position in output
|
|
||||||
difference_type i = 0;
|
|
||||||
OutputIterator out = result;
|
|
||||||
if (iam == 0)
|
|
||||||
{
|
|
||||||
begin = borders[0] + 1; // == 1
|
|
||||||
end = borders[iam + 1];
|
|
||||||
|
|
||||||
i++;
|
// Check for length without duplicates
|
||||||
new (static_cast<void *>(&*out)) value_type(*first);
|
// Needed for position in output
|
||||||
out++;
|
difference_type i = 0;
|
||||||
|
OutputIterator out = result;
|
||||||
|
|
||||||
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
if (iam == 0)
|
||||||
{
|
{
|
||||||
if (!binary_pred(*iter, *(iter-1)))
|
begin = borders[0] + 1; // == 1
|
||||||
{
|
end = borders[iam + 1];
|
||||||
i++;
|
|
||||||
new (static_cast<void *>(&*out)) value_type(*iter);
|
i++;
|
||||||
out++;
|
new (static_cast<void *>(&*out)) value_type(*first);
|
||||||
}
|
out++;
|
||||||
}
|
|
||||||
}
|
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
||||||
|
{
|
||||||
|
if (!binary_pred(*iter, *(iter-1)))
|
||||||
|
{
|
||||||
|
i++;
|
||||||
|
new (static_cast<void *>(&*out)) value_type(*iter);
|
||||||
|
out++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
begin = borders[iam]; //one part
|
begin = borders[iam]; //one part
|
||||||
end = borders[iam + 1];
|
end = borders[iam + 1];
|
||||||
|
|
||||||
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
||||||
{
|
{
|
||||||
if (!binary_pred(*iter, *(iter-1)))
|
if (!binary_pred(*iter, *(iter-1)))
|
||||||
{
|
{
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
counter[iam] = i;
|
counter[iam] = i;
|
||||||
|
|
||||||
// Last part still untouched.
|
// Last part still untouched.
|
||||||
difference_type begin_output;
|
difference_type begin_output;
|
||||||
|
|
||||||
#pragma omp barrier
|
# pragma omp barrier
|
||||||
|
|
||||||
// Store result in output on calculated positions.
|
// Store result in output on calculated positions.
|
||||||
begin_output = 0;
|
begin_output = 0;
|
||||||
|
|
||||||
if (iam == 0)
|
if (iam == 0)
|
||||||
{
|
{
|
||||||
for (int t = 0; t < num_threads; t++)
|
for (int t = 0; t < num_threads; t++)
|
||||||
begin_output += counter[t];
|
begin_output += counter[t];
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
|
|
||||||
OutputIterator iter_out = result + begin_output;
|
OutputIterator iter_out = result + begin_output;
|
||||||
|
|
||||||
begin = borders[num_threads];
|
begin = borders[num_threads];
|
||||||
end = size;
|
end = size;
|
||||||
|
|
||||||
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
||||||
{
|
{
|
||||||
if (iter == first || !binary_pred(*iter, *(iter-1)))
|
if (iter == first || !binary_pred(*iter, *(iter-1)))
|
||||||
{
|
{
|
||||||
i++;
|
i++;
|
||||||
new (static_cast<void *>(&*iter_out)) value_type(*iter);
|
new (static_cast<void *>(&*iter_out)) value_type(*iter);
|
||||||
iter_out++;
|
iter_out++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
counter[num_threads] = i;
|
counter[num_threads] = i;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for (int t = 0; t < iam; t++)
|
for (int t = 0; t < iam; t++)
|
||||||
begin_output += counter[t];
|
begin_output += counter[t];
|
||||||
|
|
||||||
OutputIterator iter_out = result + begin_output;
|
OutputIterator iter_out = result + begin_output;
|
||||||
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
for (InputIterator iter = first + begin; iter < first + end; ++iter)
|
||||||
{
|
{
|
||||||
if (!binary_pred(*iter, *(iter-1)))
|
if (!binary_pred(*iter, *(iter-1)))
|
||||||
{
|
{
|
||||||
new (static_cast<void *> (&*iter_out)) value_type(*iter);
|
new (static_cast<void *> (&*iter_out)) value_type(*iter);
|
||||||
iter_out++;
|
iter_out++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
difference_type end_output = 0;
|
difference_type end_output = 0;
|
||||||
for (int t = 0; t < num_threads + 1; t++)
|
for (int t = 0; t < num_threads + 1; t++)
|
||||||
end_output += counter[t];
|
end_output += counter[t];
|
||||||
|
|
||||||
|
delete[] borders;
|
||||||
|
|
||||||
return result + end_output;
|
return result + end_output;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Parallel std::unique_copy(), without explicit equality predicate
|
/** @brief Parallel std::unique_copy(), without explicit equality predicate
|
||||||
* @param first Begin iterator of input sequence.
|
* @param first Begin iterator of input sequence.
|
||||||
* @param last End iterator of input sequence.
|
* @param last End iterator of input sequence.
|
||||||
* @param result Begin iterator of result sequence.
|
* @param result Begin iterator of result sequence.
|
||||||
* @return End iterator of result sequence. */
|
* @return End iterator of result sequence. */
|
||||||
template<typename InputIterator, class OutputIterator>
|
template<typename InputIterator, class OutputIterator>
|
||||||
inline OutputIterator
|
inline OutputIterator
|
||||||
parallel_unique_copy(InputIterator first, InputIterator last,
|
parallel_unique_copy(InputIterator first, InputIterator last,
|
||||||
OutputIterator result)
|
OutputIterator result)
|
||||||
{
|
{
|
||||||
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
|
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -55,8 +55,8 @@ namespace __gnu_parallel
|
||||||
|
|
||||||
#define _GLIBCXX_JOB_VOLATILE volatile
|
#define _GLIBCXX_JOB_VOLATILE volatile
|
||||||
|
|
||||||
/** @brief One job for a certain thread. */
|
/** @brief One job for a certain thread. */
|
||||||
template<typename _DifferenceTp>
|
template<typename _DifferenceTp>
|
||||||
struct Job
|
struct Job
|
||||||
{
|
{
|
||||||
typedef _DifferenceTp difference_type;
|
typedef _DifferenceTp difference_type;
|
||||||
|
|
@ -78,31 +78,38 @@ namespace __gnu_parallel
|
||||||
_GLIBCXX_JOB_VOLATILE difference_type load;
|
_GLIBCXX_JOB_VOLATILE difference_type load;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** @brief Work stealing algorithm for random access iterators.
|
/** @brief Work stealing algorithm for random access iterators.
|
||||||
*
|
*
|
||||||
* Uses O(1) additional memory. Synchronization at job lists is
|
* Uses O(1) additional memory. Synchronization at job lists is
|
||||||
* done with atomic operations.
|
* done with atomic operations.
|
||||||
* @param begin Begin iterator of element sequence.
|
* @param begin Begin iterator of element sequence.
|
||||||
* @param end End iterator of element sequence.
|
* @param end End iterator of element sequence.
|
||||||
* @param op User-supplied functor (comparator, predicate, adding
|
* @param op User-supplied functor (comparator, predicate, adding
|
||||||
* functor, ...).
|
* functor, ...).
|
||||||
* @param f Functor to "process" an element with op (depends on
|
* @param f Functor to "process" an element with op (depends on
|
||||||
* desired functionality, e. g. for std::for_each(), ...).
|
* desired functionality, e. g. for std::for_each(), ...).
|
||||||
* @param r Functor to "add" a single result to the already
|
* @param r Functor to "add" a single result to the already
|
||||||
* processed elements (depends on functionality).
|
* processed elements (depends on functionality).
|
||||||
* @param base Base value for reduction.
|
* @param base Base value for reduction.
|
||||||
* @param output Pointer to position where final result is written to
|
* @param output Pointer to position where final result is written to
|
||||||
* @param bound Maximum number of elements processed (e. g. for
|
* @param bound Maximum number of elements processed (e. g. for
|
||||||
* std::count_n()).
|
* std::count_n()).
|
||||||
* @return User-supplied functor (that may contain a part of the result).
|
* @return User-supplied functor (that may contain a part of the result).
|
||||||
*/
|
*/
|
||||||
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
|
template<
|
||||||
|
typename RandomAccessIterator,
|
||||||
|
typename Op,
|
||||||
|
typename Fu,
|
||||||
|
typename Red,
|
||||||
|
typename Result>
|
||||||
Op
|
Op
|
||||||
for_each_template_random_access_workstealing(RandomAccessIterator begin,
|
for_each_template_random_access_workstealing(
|
||||||
RandomAccessIterator end,
|
RandomAccessIterator begin,
|
||||||
Op op, Fu& f, Red r,
|
RandomAccessIterator end,
|
||||||
Result base, Result& output,
|
Op op, Fu& f, Red r,
|
||||||
typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
|
Result base, Result& output,
|
||||||
|
typename std::iterator_traits<RandomAccessIterator>::difference_type
|
||||||
|
bound)
|
||||||
{
|
{
|
||||||
_GLIBCXX_CALL(end - begin)
|
_GLIBCXX_CALL(end - begin)
|
||||||
|
|
||||||
|
|
@ -110,182 +117,187 @@ namespace __gnu_parallel
|
||||||
typedef typename traits_type::difference_type difference_type;
|
typedef typename traits_type::difference_type difference_type;
|
||||||
|
|
||||||
|
|
||||||
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
|
difference_type chunk_size =
|
||||||
|
static_cast<difference_type>(Settings::workstealing_chunk_size);
|
||||||
|
|
||||||
// How many jobs?
|
// How many jobs?
|
||||||
difference_type length = (bound < 0) ? (end - begin) : bound;
|
difference_type length = (bound < 0) ? (end - begin) : bound;
|
||||||
|
|
||||||
// To avoid false sharing in a cache line.
|
// To avoid false sharing in a cache line.
|
||||||
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
|
const int stride =
|
||||||
|
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
|
||||||
|
|
||||||
// Total number of threads currently working.
|
// Total number of threads currently working.
|
||||||
thread_index_t busy = 0;
|
thread_index_t busy = 0;
|
||||||
thread_index_t num_threads = get_max_threads();
|
|
||||||
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
|
Job<difference_type> *job;
|
||||||
|
|
||||||
omp_lock_t output_lock;
|
omp_lock_t output_lock;
|
||||||
omp_init_lock(&output_lock);
|
omp_init_lock(&output_lock);
|
||||||
|
|
||||||
// No more threads than jobs, at least one thread.
|
|
||||||
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
|
|
||||||
num_threads = static_cast<thread_index_t>(num_threads_max);
|
|
||||||
|
|
||||||
// Create job description array.
|
|
||||||
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
|
|
||||||
|
|
||||||
// Write base value to output.
|
// Write base value to output.
|
||||||
output = base;
|
output = base;
|
||||||
|
|
||||||
#pragma omp parallel shared(busy) num_threads(num_threads)
|
// No more threads than jobs, at least one thread.
|
||||||
{
|
thread_index_t num_threads =
|
||||||
// Initialization phase.
|
__gnu_parallel::max<thread_index_t>(1,
|
||||||
|
__gnu_parallel::min<difference_type>(length, get_max_threads()));
|
||||||
|
|
||||||
// Flags for every thread if it is doing productive work.
|
# pragma omp parallel shared(busy) num_threads(num_threads)
|
||||||
bool iam_working = false;
|
{
|
||||||
|
|
||||||
// Thread id.
|
# pragma omp single
|
||||||
thread_index_t iam = omp_get_thread_num();
|
{
|
||||||
|
num_threads = omp_get_num_threads();
|
||||||
|
|
||||||
// This job.
|
// Create job description array.
|
||||||
Job<difference_type>& my_job = job[iam * stride];
|
job = new Job<difference_type>[num_threads * stride];
|
||||||
|
}
|
||||||
|
|
||||||
// Random number (for work stealing).
|
// Initialization phase.
|
||||||
thread_index_t victim;
|
|
||||||
|
|
||||||
// Local value for reduction.
|
// Flags for every thread if it is doing productive work.
|
||||||
Result result = Result();
|
bool iam_working = false;
|
||||||
|
|
||||||
// Number of elements to steal in one attempt.
|
// Thread id.
|
||||||
difference_type steal;
|
thread_index_t iam = omp_get_thread_num();
|
||||||
|
|
||||||
// Every thread has its own random number generator (modulo num_threads).
|
// This job.
|
||||||
random_number rand_gen(iam, num_threads);
|
Job<difference_type>& my_job = job[iam * stride];
|
||||||
|
|
||||||
#pragma omp atomic
|
// Random number (for work stealing).
|
||||||
// This thread is currently working.
|
thread_index_t victim;
|
||||||
busy++;
|
|
||||||
|
|
||||||
iam_working = true;
|
// Local value for reduction.
|
||||||
|
Result result = Result();
|
||||||
|
|
||||||
// How many jobs per thread? last thread gets the rest.
|
// Number of elements to steal in one attempt.
|
||||||
my_job.first = static_cast<difference_type>(iam * (length / num_threads));
|
difference_type steal;
|
||||||
|
|
||||||
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
|
// Every thread has its own random number generator
|
||||||
my_job.load = my_job.last - my_job.first + 1;
|
// (modulo num_threads).
|
||||||
|
random_number rand_gen(iam, num_threads);
|
||||||
|
|
||||||
// Init result with first value (to have a base value for reduction).
|
// This thread is currently working.
|
||||||
if (my_job.first <= my_job.last)
|
# pragma omp atomic
|
||||||
{
|
busy++;
|
||||||
// Cannot use volatile variable directly.
|
|
||||||
difference_type my_first = my_job.first;
|
|
||||||
result = f(op, begin + my_first);
|
|
||||||
my_job.first++;
|
|
||||||
my_job.load--;
|
|
||||||
}
|
|
||||||
|
|
||||||
RandomAccessIterator current;
|
iam_working = true;
|
||||||
|
|
||||||
#pragma omp barrier
|
// How many jobs per thread? last thread gets the rest.
|
||||||
|
my_job.first =
|
||||||
|
static_cast<difference_type>(iam * (length / num_threads));
|
||||||
|
|
||||||
// Actual work phase
|
my_job.last = (iam == (num_threads - 1)) ?
|
||||||
// Work on own or stolen start
|
(length - 1) : ((iam + 1) * (length / num_threads) - 1);
|
||||||
while (busy > 0)
|
my_job.load = my_job.last - my_job.first + 1;
|
||||||
{
|
|
||||||
// Work until no productive thread left.
|
|
||||||
#pragma omp flush(busy)
|
|
||||||
|
|
||||||
// Thread has own work to do
|
// Init result with first value (to have a base value for reduction).
|
||||||
while (my_job.first <= my_job.last)
|
if (my_job.first <= my_job.last)
|
||||||
{
|
{
|
||||||
// fetch-and-add call
|
// Cannot use volatile variable directly.
|
||||||
// Reserve current job block (size chunk_size) in my queue.
|
difference_type my_first = my_job.first;
|
||||||
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
|
result = f(op, begin + my_first);
|
||||||
|
my_job.first++;
|
||||||
|
my_job.load--;
|
||||||
|
}
|
||||||
|
|
||||||
// Update load, to make the three values consistent,
|
RandomAccessIterator current;
|
||||||
// first might have been changed in the meantime
|
|
||||||
my_job.load = my_job.last - my_job.first + 1;
|
|
||||||
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
|
|
||||||
{
|
|
||||||
// Yes: process it!
|
|
||||||
current = begin + current_job;
|
|
||||||
current_job++;
|
|
||||||
|
|
||||||
// Do actual work.
|
# pragma omp barrier
|
||||||
result = r(result, f(op, current));
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp flush(busy)
|
// Actual work phase
|
||||||
|
// Work on own or stolen start
|
||||||
|
while (busy > 0)
|
||||||
|
{
|
||||||
|
// Work until no productive thread left.
|
||||||
|
# pragma omp flush(busy)
|
||||||
|
|
||||||
}
|
// Thread has own work to do
|
||||||
|
while (my_job.first <= my_job.last)
|
||||||
|
{
|
||||||
|
// fetch-and-add call
|
||||||
|
// Reserve current job block (size chunk_size) in my queue.
|
||||||
|
difference_type current_job =
|
||||||
|
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
|
||||||
|
|
||||||
// After reaching this point, a thread's job list is empty.
|
// Update load, to make the three values consistent,
|
||||||
if (iam_working)
|
// first might have been changed in the meantime
|
||||||
{
|
my_job.load = my_job.last - my_job.first + 1;
|
||||||
#pragma omp atomic
|
for (difference_type job_counter = 0;
|
||||||
// This thread no longer has work.
|
job_counter < chunk_size && current_job <= my_job.last;
|
||||||
busy--;
|
job_counter++)
|
||||||
|
{
|
||||||
|
// Yes: process it!
|
||||||
|
current = begin + current_job;
|
||||||
|
current_job++;
|
||||||
|
|
||||||
iam_working = false;
|
// Do actual work.
|
||||||
}
|
result = r(result, f(op, current));
|
||||||
|
}
|
||||||
|
|
||||||
difference_type supposed_first, supposed_last, supposed_load;
|
# pragma omp flush(busy)
|
||||||
do
|
}
|
||||||
{
|
|
||||||
// Find random nonempty deque (not own) and do consistency check.
|
|
||||||
yield();
|
|
||||||
#pragma omp flush(busy)
|
|
||||||
victim = rand_gen();
|
|
||||||
supposed_first = job[victim * stride].first;
|
|
||||||
supposed_last = job[victim * stride].last;
|
|
||||||
supposed_load = job[victim * stride].load;
|
|
||||||
}
|
|
||||||
while (busy > 0
|
|
||||||
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
|
|
||||||
|
|
||||||
if (busy == 0)
|
// After reaching this point, a thread's job list is empty.
|
||||||
break;
|
if (iam_working)
|
||||||
|
{
|
||||||
|
// This thread no longer has work.
|
||||||
|
# pragma omp atomic
|
||||||
|
busy--;
|
||||||
|
|
||||||
if (supposed_load > 0)
|
iam_working = false;
|
||||||
{
|
}
|
||||||
// Has work and work to do.
|
|
||||||
// Number of elements to steal (at least one).
|
|
||||||
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
|
|
||||||
|
|
||||||
// Protects against stealing threads
|
difference_type supposed_first, supposed_last, supposed_load;
|
||||||
// omp_set_lock(&(job[victim * stride].lock));
|
do
|
||||||
|
{
|
||||||
|
// Find random nonempty deque (not own), do consistency check.
|
||||||
|
yield();
|
||||||
|
# pragma omp flush(busy)
|
||||||
|
victim = rand_gen();
|
||||||
|
supposed_first = job[victim * stride].first;
|
||||||
|
supposed_last = job[victim * stride].last;
|
||||||
|
supposed_load = job[victim * stride].load;
|
||||||
|
}
|
||||||
|
while (busy > 0
|
||||||
|
&& ((supposed_load <= 0)
|
||||||
|
|| ((supposed_first + supposed_load - 1) != supposed_last)));
|
||||||
|
|
||||||
// Push victim's start forward.
|
if (busy == 0)
|
||||||
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
|
break;
|
||||||
difference_type stolen_try = stolen_first + steal - difference_type(1);
|
|
||||||
|
|
||||||
// Protects against working thread
|
if (supposed_load > 0)
|
||||||
// omp_unset_lock(&(job[victim * stride].lock));
|
{
|
||||||
|
// Has work and work to do.
|
||||||
|
// Number of elements to steal (at least one).
|
||||||
|
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
|
||||||
|
|
||||||
my_job.first = stolen_first;
|
// Push victim's start forward.
|
||||||
|
difference_type stolen_first =
|
||||||
// Avoid std::min dependencies.
|
fetch_and_add<difference_type>(
|
||||||
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
|
&(job[victim * stride].first), steal);
|
||||||
|
difference_type stolen_try =
|
||||||
|
stolen_first + steal - difference_type(1);
|
||||||
|
|
||||||
my_job.load = my_job.last - my_job.first + 1;
|
my_job.first = stolen_first;
|
||||||
|
my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
|
||||||
|
my_job.load = my_job.last - my_job.first + 1;
|
||||||
|
|
||||||
//omp_unset_lock(&(my_job.lock));
|
// Has potential work again.
|
||||||
|
# pragma omp atomic
|
||||||
|
busy++;
|
||||||
|
iam_working = true;
|
||||||
|
|
||||||
#pragma omp atomic
|
# pragma omp flush(busy)
|
||||||
// Has potential work again.
|
}
|
||||||
busy++;
|
# pragma omp flush(busy)
|
||||||
iam_working = true;
|
} // end while busy > 0
|
||||||
|
// Add accumulated result to output.
|
||||||
#pragma omp flush(busy)
|
omp_set_lock(&output_lock);
|
||||||
}
|
output = r(output, result);
|
||||||
#pragma omp flush(busy)
|
omp_unset_lock(&output_lock);
|
||||||
} // end while busy > 0
|
}
|
||||||
// Add accumulated result to output.
|
|
||||||
omp_set_lock(&output_lock);
|
|
||||||
output = r(output, result);
|
|
||||||
omp_unset_lock(&output_lock);
|
|
||||||
|
|
||||||
//omp_destroy_lock(&(my_job.lock));
|
|
||||||
}
|
|
||||||
|
|
||||||
delete[] job;
|
delete[] job;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue