17#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
18#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
32#define HAVE_AVX2SORT 0
35#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
45#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
46#include "third_party/ips4o/include/ips4o.hpp"
47#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
50#include "third_party/boost/allowed/sort/sort.hpp"
65#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
66 apply_to = any(function))
68#pragma clang attribute push(__attribute__((target("avx2"))), \
69 apply_to = any(function))
73#pragma GCC push_options
75#pragma GCC target("avx512f,avx512dq")
77#pragma GCC target("avx2")
83#include "vxsort/machine_traits.avx512.h"
85#include "vxsort/machine_traits.avx2.h"
87#include "vxsort/vxsort.h"
90#pragma clang attribute pop
92#pragma GCC pop_options
114 return "unreachable";
127 static_assert(
sizeof(T) <= 8,
"Expected a built-in type");
128 CopyBytes<sizeof(T)>(&value, &bits);
136 static_cast<int>(other.
count_));
140 HWY_ABORT(
"minmax %f/%f vs %f/%f\n",
static_cast<double>(
min_),
141 static_cast<double>(
max_),
static_cast<double>(other.
min_),
142 static_cast<double>(other.
max_));
147 HWY_ABORT(
"Sum mismatch %g %g; min %g max %g\n",
148 static_cast<double>(
sum_),
static_cast<double>(other.
sum_),
149 static_cast<double>(
min_),
static_cast<double>(
max_));
156 T
min_ = hwy::HighestValue<T>();
157 T
max_ = hwy::LowestValue<T>();
169#if HAVE_PARALLEL_IPS4O
196#if HAVE_PARALLEL_IPS4O
197 case Algo::kParallelIPS4O:
219 return "unreachable";
226#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
227 defined(HWY_TARGET_TOGGLE)
228#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
229#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
231#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
245 z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
246 z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
247 return z ^ (z >> 31);
253 template <
class DU64>
256 for (
size_t i = 1; i < 2 *
Lanes(du64); ++i) {
262 template <
class VU64>
266 const VU64 bits = Add(s1, s0);
268 s1 =
Xor(s1, ShiftLeft<23>(s1));
269 state1 =
Xor(s1,
Xor(s0,
Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
274template <
class D,
class VU64, HWY_IF_NOT_FLOAT_D(D)>
282template <
class DF,
class VU64, HWY_IF_FLOAT_D(DF)>
286 using VU =
Vec<
decltype(du)>;
290#if HWY_TARGET == HWY_SCALAR
292 const VU bits =
Set(du,
static_cast<TU
>(
GetLane(bits64) & LimitsMax<TU>()));
294 const VU bits =
BitCast(du, bits64);
299 const VU mantissa_mask =
Set(du, MantissaMask<TF>());
300 const VU representation =
OrAnd(k1, bits, mantissa_mask);
301 return BitCast(df, representation);
309 : 0xFFFFFFFFFFFFFFFFull);
313 : 0xFFFFFFFFFFFFFFFFull);
317 : 0x00000000FFFFFFFFull);
327 using VU64 =
Vec<
decltype(du64)>;
328 const size_t N64 =
Lanes(du64);
329 auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
331 VU64 s0 =
Load(du64, seeds.get());
332 VU64 s1 =
Load(du64, seeds.get() + N64);
334#if HWY_TARGET == HWY_SCALAR
339 using V =
Vec<
decltype(
d)>;
341 const VU64 mask =
MaskForDist(du64, dist,
sizeof(T));
342 auto buf = hwy::AllocateAligned<T>(
N);
345 for (; i +
N <= num; i +=
N) {
352 memcpy(
v + i, buf.get(), (num - i) *
sizeof(T));
356 for (
size_t i = 0; i < num; ++i) {
367#if HAVE_PARALLEL_IPS4O
368 const unsigned max_threads = hwy::LimitsMax<unsigned>();
369 ips4o::StdThreadPool pool{
static_cast<int>(
370 HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
372 std::vector<ThreadLocal>
tls{1};
377template <
class Order,
typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
381 if (Order().IsAscending()) {
382 const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
385 const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
391template <
class Order>
393 using detail::SharedTraits;
394 using detail::Traits128;
395 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
396 const size_t num_lanes = num_keys * 2;
397 if (Order().IsAscending()) {
398 const SharedTraits<Traits128<detail::OrderAscending128>> st;
401 const SharedTraits<Traits128<detail::OrderDescending128>> st;
406template <
class Order>
408 using detail::SharedTraits;
409 using detail::Traits128;
410 uint64_t* lanes =
reinterpret_cast<uint64_t*
>(keys);
411 const size_t num_lanes = num_keys * 2;
412 if (Order().IsAscending()) {
413 const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
416 const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
422template <
class Order,
typename KeyType>
425 const std::less<KeyType> less;
426 const std::greater<KeyType> greater;
431 return avx2::quicksort(inout,
static_cast<int>(num));
436 if (Order().IsAscending()) {
437 return ips4o::sort(inout, inout + num, less);
439 return ips4o::sort(inout, inout + num, greater);
443#if HAVE_PARALLEL_IPS4O
444 case Algo::kParallelIPS4O:
445 if (Order().IsAscending()) {
446 return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
448 return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
460 if (Order().IsAscending()) {
461 return boost::sort::pdqsort_branchless(inout, inout + num, less);
463 return boost::sort::pdqsort_branchless(inout, inout + num, greater);
468 case Algo::kVXSort: {
469#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
470 (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
471 fprintf(stderr,
"Do not call for target %s\n",
476 vxsort::vxsort<KeyType, vxsort::AVX512> vx;
478 vxsort::vxsort<KeyType, vxsort::AVX2> vx;
480 if (Order().IsAscending()) {
481 return vx.sort(inout, inout + num - 1);
483 fprintf(stderr,
"Skipping VX - does not support descending order\n");
491 if (Order().IsAscending()) {
492 return std::sort(inout, inout + num, less);
494 return std::sort(inout, inout + num, greater);
498 return shared.
tls[thread].sorter(inout, num, Order());
501 return CallHeapSort<Order>(inout, num);
#define HWY_RESTRICT
Definition base.h:64
#define HWY_POP_ATTRIBUTES
Definition base.h:123
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_ABORT(format,...)
Definition base.h:188
#define HWY_INLINE
Definition base.h:70
#define HWY_PUSH_ATTRIBUTES(targets_str)
Definition base.h:122
Definition algo-inl.h:243
static void GenerateSeeds(DU64 du64, TFromD< DU64 > *HWY_RESTRICT seeds)
Definition algo-inl.h:254
static HWY_INLINE uint64_t SplitMix64(uint64_t z)
Definition algo-inl.h:244
static VU64 RandomBits(VU64 &state0, VU64 &state1)
Definition algo-inl.h:263
#define HWY_TARGET
Definition detect_targets.h:380
void HeapSort(Traits st, T *HWY_RESTRICT lanes, const size_t num_lanes)
Definition vqsort-inl.h:127
d
Definition rvv-inl.h:1998
InputStats< T > GenerateInput(const Dist dist, T *v, size_t num)
Definition algo-inl.h:325
void CallHeapSort(KeyType *HWY_RESTRICT keys, const size_t num_keys)
Definition algo-inl.h:378
void Run(Algo algo, KeyType *HWY_RESTRICT inout, size_t num, SharedState &shared, size_t thread)
Definition algo-inl.h:423
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
Vec< DU64 > MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t)
Definition algo-inl.h:305
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
N
Definition rvv-inl.h:1998
ScalableTag< T, -1 > SortTag
Definition contrib/sort/shared-inl.h:124
Vec< D > RandomValues(D d, VU64 &s0, VU64 &s1, const VU64 mask)
Definition algo-inl.h:275
const vfloat64m1_t v
Definition rvv-inl.h:1998
typename D::T TFromD
Definition ops/shared-inl.h:203
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:40
Definition aligned_allocator.h:27
static const char * DistName(Dist dist)
Definition algo-inl.h:105
Dist
Definition algo-inl.h:99
static std::vector< Dist > AllDist()
Definition algo-inl.h:101
static const char * AlgoName(Algo algo)
Definition algo-inl.h:186
static HWY_MAYBE_UNUSED const char * TargetName(int64_t target)
Definition targets.h:85
Algo
Definition algo-inl.h:162
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
Definition algo-inl.h:366
std::vector< ThreadLocal > tls
Definition algo-inl.h:372
Definition ops/shared-inl.h:52
Definition algo-inl.h:362
Sorter sorter
Definition algo-inl.h:363
Definition sorting_networks-inl.h:698
Definition traits-inl.h:545