29#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
30#define HWY_SVE_IS_POW2 1
32#define HWY_SVE_IS_POW2 0
55#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
56#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
57#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
58 X_MACRO(uint, u, 32, 16, NAME, OP)
59#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
60 X_MACRO(uint, u, 64, 32, NAME, OP)
63#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
64#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
65#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
66#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)
69#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
70 X_MACRO(float, f, 16, 16, NAME, OP)
71#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
72 X_MACRO(float, f, 32, 16, NAME, OP)
73#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
74 X_MACRO(float, f, 64, 32, NAME, OP)
// Type-class groupings: expand X_MACRO once per lane type in the class.
// All unsigned integer lane types (u8/u16/u32/u64).
77#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
 78 HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
 79 HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
 80 HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
 81 HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
// All signed integer lane types (s8/s16/s32/s64).
83#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
 84 HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) \
 85 HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) \
 86 HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
 87 HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
// All floating-point lane types (f16/f32/f64).
89#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
 90 HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
 91 HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
 92 HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
// Same-width unsigned+signed pairs: expand X_MACRO for both the uNN and sNN
// lane types of one width.
95#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
 96 HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
 97 HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
99#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
 100 HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
 101 HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)
103#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
 104 HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
 105 HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)
107#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
 108 HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
 109 HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
// Every 32- and 64-bit lane type: u32/s32/u64/s64 plus f32/f64.
111#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
 112 HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
 113 HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
 114 HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
 115 HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
// Broad aggregates used by most op definitions below.
// All integer lane types (unsigned then signed).
118#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
 119 HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
 120 HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
// All signed integer and floating-point lane types.
122#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
 123 HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
 124 HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
// Every supported lane type: unsigned, signed, and floating-point.
126#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
 127 HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
 128 HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
 129 HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
// Name-assembly macros used inside the X-macros above:
//   HWY_SVE_T(uint, 8) -> uint8_t                       (scalar lane type)
//   HWY_SVE_D(uint, 8, N, POW2) -> Simd<uint8_t, N, POW2> (descriptor/tag)
//   HWY_SVE_V(uint, 8) -> svuint8_t                     (SVE vector type)
132#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
133#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
134#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
138#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
140 struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
141 using type = ScalableTag<HWY_SVE_T(BASE, BITS)>; \
151#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP) \
152 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
153 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
155#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP) \
156 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
157 return sv##OP##_##CHAR##BITS(v); \
161#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
162 HWY_API HWY_SVE_V(BASE, BITS) \
163 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
164 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
166#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \
167 HWY_API HWY_SVE_V(BASE, BITS) \
168 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
169 return sv##OP##_##CHAR##BITS(a, b); \
173#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
174 HWY_API HWY_SVE_V(BASE, BITS) \
175 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
176 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
178#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
179 HWY_API HWY_SVE_V(BASE, BITS) \
180 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
181 return sv##OP##_##CHAR##BITS(a, b); \
184#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
185 HWY_API HWY_SVE_V(BASE, BITS) \
186 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
187 HWY_SVE_V(BASE, BITS) c) { \
188 return sv##OP##_##CHAR##BITS(a, b, c); \
197 return svcntb_pat(SV_ALL);
200 return svcnth_pat(SV_ALL);
203 return svcntw_pat(SV_ALL);
206 return svcntd_pat(SV_ALL);
// Predicate with every element of the full hardware vector active
// (svptrue_pat with the SV_ALL pattern), for the given element width in bits.
210#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
213#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
215#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
218template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
220 return svcntb_pat(SV_POW2);
222template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
224 return svcnth_pat(SV_POW2);
226template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
228 return svcntw_pat(SV_POW2);
230template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
232 return svcntd_pat(SV_POW2);
241#if HWY_TARGET == HWY_SVE_256
242template <
typename T,
size_t N,
int kPow2>
246#elif HWY_TARGET == HWY_SVE2_128
247template <
typename T,
size_t N,
int kPow2>
252template <
typename T,
size_t N,
int kPow2>
254 const size_t actual = detail::HardwareLanes<T>();
266#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \
267 template <size_t N, int kPow2> \
268 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \
269 const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
270 return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
280#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
281 template <size_t N, int kPow2> \
282 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
283 return HWY_SVE_PTRUE(BITS); \
285 template <size_t N, int kPow2> \
286 HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
287 return HWY_SVE_ALL_PTRUE(BITS); \
291#undef HWY_SVE_WRAP_PTRUE
310#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
311 template <size_t N, int kPow2> \
312 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
313 HWY_SVE_T(BASE, BITS) arg) { \
314 return sv##OP##_##CHAR##BITS(arg); \
321template <
size_t N,
int kPow2>
327using VFromD =
decltype(
Set(D(), TFromD<D>()));
340#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
341 template <size_t N, int kPow2> \
342 HWY_API HWY_SVE_V(BASE, BITS) \
343 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
344 return sv##OP##_##CHAR##BITS(); \
354#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \
355 HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
358 template <size_t N, int kPow2> \
359 HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
360 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
365#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \
366 HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
367 return sv##OP##_u8_##CHAR##BITS(v); \
369 template <size_t N, int kPow2> \
370 HWY_INLINE HWY_SVE_V(BASE, BITS) \
371 BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) , svuint8_t v) { \
372 return sv##OP##_##CHAR##BITS##_u8(v); \
382#undef HWY_SVE_CAST_NOP
385template <
size_t N,
int kPow2>
393template <
class D,
class FromV>
413template <
class V, HWY_IF_FLOAT_V(V)>
424template <
class V, HWY_IF_FLOAT_V(V)>
439template <
class V, HWY_IF_FLOAT_V(V)>
449#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
450 HWY_API HWY_SVE_V(BASE, BITS) \
451 NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
452 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
456#undef HWY_SVE_RETV_ARGPVN_SWAP
459#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
460 HWY_API HWY_SVE_V(BASE, BITS) \
461 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
462 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
465#undef HWY_SVE_RETV_ARGPVV_SWAP
467template <
class V, HWY_IF_FLOAT_V(V)>
476#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
480template <
class V, HWY_IF_FLOAT_V(V)>
490 return Xor(x1,
Xor(x2, x3));
497 return Or(o1,
Or(o2, o3));
503 return Or(o,
And(a1, a2));
508#ifdef HWY_NATIVE_POPCNT
509#undef HWY_NATIVE_POPCNT
511#define HWY_NATIVE_POPCNT
515#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \
516 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
517 return BitCast(DFromV<decltype(v)>(), \
518 sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
542 return Or(abs,
And(msb, sign));
559#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \
560 HWY_API HWY_SVE_V(BASE, BITS) \
561 NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
562 return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
566#undef HWY_SVE_RETV_ARGPVN_MASK
575 const svbool_t pg = detail::PTrue(du64);
577 const svuint32_t sums_of_4 = svdot_n_u32(
Zero(du32),
v, 1);
580 const svuint64_t hi = svlsr_n_u64_x(pg,
BitCast(du64, sums_of_4), 32);
582 const svuint64_t lo = svextw_u64_x(pg,
BitCast(du64, sums_of_4));
601#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
602 template <int kBits> \
603 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
604 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
606 HWY_API HWY_SVE_V(BASE, BITS) \
607 NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
608 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
618#undef HWY_SVE_SHIFT_N
623template <
int kBits,
class V>
625 constexpr size_t kSizeInBits =
sizeof(
TFromV<V>) * 8;
626 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
627 if (kBits == 0)
return v;
628 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
633#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
634 HWY_API HWY_SVE_V(BASE, BITS) \
635 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
636 const RebindToUnsigned<DFromV<decltype(v)>> du; \
637 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
638 BitCast(du, bits)); \
665#ifdef HWY_NATIVE_I64MULLO
666#undef HWY_NATIVE_I64MULLO
668#define HWY_NATIVE_I64MULLO
679#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
680 return svqrdmulh_s16(a, b);
685 const svuint16_t lo =
BitCast(du, Mul(a, b));
686 const svint16_t hi =
MulHigh(a, b);
690 const svuint16_t lo_top2 = ShiftRight<14>(lo);
692 const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
693 return Add(Add(hi, hi),
BitCast(
d, rounding));
710#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
711 HWY_API HWY_SVE_V(BASE, BITS) \
712 NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
713 HWY_SVE_V(BASE, BITS) add) { \
714 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
740template <
class D,
typename MFrom>
754 return svand_b_z(b, b, a);
757 return svbic_b_z(b, b, a);
760 return svsel_b(a, a, b);
763 return svsel_b(a, svnand_b_z(a, a, b), b);
772#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
773 template <size_t N, int kPow2> \
774 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
775 return sv##OP##_b##BITS(detail::MakeMask(d), m); \
779#undef HWY_SVE_COUNT_TRUE
784#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \
785 template <size_t N, int kPow2> \
786 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
787 return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
791#undef HWY_SVE_COUNT_TRUE_FULL
811 :
static_cast<intptr_t
>(
822#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
823 HWY_API HWY_SVE_V(BASE, BITS) \
824 NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
825 return sv##OP##_##CHAR##BITS(m, yes, no); \
829#undef HWY_SVE_IF_THEN_ELSE
846#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \
847 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
848 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
850#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \
851 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
852 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
876#undef HWY_SVE_COMPARE
877#undef HWY_SVE_COMPARE_N
892 return detail::NeN(
And(a, bit), 0);
898 return detail::NeN(
v,
static_cast<TFromV<V>>(0));
912#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
914#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \
915 HWY_API HWY_SVE_V(BASE, BITS) \
916 NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
917 HWY_SVE_V(BASE, BITS) no) { \
918 return sv##OP##_##CHAR##BITS(yes, no, mask); \
924template <
class V, HWY_IF_FLOAT_V(V)>
955 return RebindMask(
d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
969 const VFromD<
decltype(di)> exp =
971 return RebindMask(
d, detail::LtN(exp, hwy::MaxExponentField<T>()));
978#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
979 template <size_t N, int kPow2> \
980 HWY_API HWY_SVE_V(BASE, BITS) \
981 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
982 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
983 return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \
986#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
987 template <size_t N, int kPow2> \
988 HWY_API HWY_SVE_V(BASE, BITS) \
989 NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) , \
990 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
991 return sv##OP##_##CHAR##BITS(m, p); \
994#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
995 template <size_t N, int kPow2> \
996 HWY_API HWY_SVE_V(BASE, BITS) \
997 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
998 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1000 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
1003#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
1004 template <size_t N, int kPow2> \
1005 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
1006 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1007 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1008 sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \
1011#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
1012 template <size_t N, int kPow2> \
1013 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1014 HWY_SVE_D(BASE, BITS, N, kPow2) , \
1015 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1016 sv##OP##_##CHAR##BITS(m, p, v); \
1027#undef HWY_SVE_MASKED_LOAD
1028#undef HWY_SVE_LOAD_DUP128
1030#undef HWY_SVE_BLENDED_STORE
1033template <
size_t N,
int kPow2>
1040template <
size_t N,
int kPow2>
1056template <
class V,
class D>
1063#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
1064 template <size_t N, int kPow2> \
1065 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
1066 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1067 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1068 HWY_SVE_V(int, BITS) offset) { \
1069 sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
1073#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1074 template <size_t N, int kPow2> \
1075 HWY_API void NAME( \
1076 HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1077 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
1078 sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
1083#undef HWY_SVE_SCATTER_OFFSET
1084#undef HWY_SVE_SCATTER_INDEX
1088#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
1089 template <size_t N, int kPow2> \
1090 HWY_API HWY_SVE_V(BASE, BITS) \
1091 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1092 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1093 HWY_SVE_V(int, BITS) offset) { \
1094 return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
1097#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1098 template <size_t N, int kPow2> \
1099 HWY_API HWY_SVE_V(BASE, BITS) \
1100 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1101 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1102 HWY_SVE_V(int, BITS) index) { \
1103 return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
1109#undef HWY_SVE_GATHER_OFFSET
1110#undef HWY_SVE_GATHER_INDEX
1115#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1116#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1118#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1121#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \
1122 template <size_t N, int kPow2> \
1123 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1124 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1125 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
1126 const sv##BASE##BITS##x2_t tuple = \
1127 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1128 v0 = svget2(tuple, 0); \
1129 v1 = svget2(tuple, 1); \
1137#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \
1138 template <size_t N, int kPow2> \
1139 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1140 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1141 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1142 HWY_SVE_V(BASE, BITS) & v2) { \
1143 const sv##BASE##BITS##x3_t tuple = \
1144 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1145 v0 = svget3(tuple, 0); \
1146 v1 = svget3(tuple, 1); \
1147 v2 = svget3(tuple, 2); \
1155#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \
1156 template <size_t N, int kPow2> \
1157 HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1158 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1159 HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1160 HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
1161 const sv##BASE##BITS##x4_t tuple = \
1162 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1163 v0 = svget4(tuple, 0); \
1164 v1 = svget4(tuple, 1); \
1165 v2 = svget4(tuple, 2); \
1166 v3 = svget4(tuple, 3); \
1174#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
1175 template <size_t N, int kPow2> \
1176 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
1177 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1178 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1179 const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
1180 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple); \
1184#undef HWY_SVE_STORE2
1188#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
1189 template <size_t N, int kPow2> \
1190 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
1191 HWY_SVE_V(BASE, BITS) v2, \
1192 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1193 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1194 const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
1195 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple); \
1199#undef HWY_SVE_STORE3
1203#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
1204 template <size_t N, int kPow2> \
1205 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
1206 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
1207 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1208 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1209 const sv##BASE##BITS##x4_t quad = \
1210 svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \
1211 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad); \
1215#undef HWY_SVE_STORE4
1222#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
1223 template <size_t N, int kPow2> \
1224 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
1225 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, HALF) v) { \
1226 return sv##OP##_##CHAR##BITS(v); \
1234template <
size_t N,
int kPow2>
1239template <
size_t N,
int kPow2>
1246template <
size_t N,
int kPow2>
1251template <
size_t N,
int kPow2>
1256template <
size_t N,
int kPow2>
1270template <
size_t N,
int kPow2>
1272 const svfloat16_t
v) {
1275 const svfloat16_t vv = detail::ZipLowerSame(
v,
v);
1279template <
size_t N,
int kPow2>
1281 const svfloat32_t
v) {
1282 const svfloat32_t vv = detail::ZipLowerSame(
v,
v);
1286template <
size_t N,
int kPow2>
1288 const svint32_t
v) {
1289 const svint32_t vv = detail::ZipLowerSame(
v,
v);
1296#undef HWY_SVE_PROMOTE_TO
1298template <
size_t N,
int kPow2>
1312template <
typename TN,
class VU>
1314 return detail::MinN(
v,
static_cast<TFromV<VU>>(LimitsMax<TN>()));
1318template <
typename TN,
class VI>
1320 return detail::MinN(detail::MaxN(
v, LimitsMin<TN>()), LimitsMax<TN>());
1325template <
size_t N,
int kPow2>
1329 using TN =
TFromD<
decltype(dn)>;
1331 const svuint16_t clamped =
BitCast(du, detail::MaxN(
v, 0));
1333 const svuint8_t vn =
BitCast(dn, detail::SaturateU<TN>(clamped));
1334 return svuzp1_u8(vn, vn);
1337template <
size_t N,
int kPow2>
1341 using TN =
TFromD<
decltype(dn)>;
1343 const svuint32_t clamped =
BitCast(du, detail::MaxN(
v, 0));
1345 const svuint16_t vn =
BitCast(dn, detail::SaturateU<TN>(clamped));
1346 return svuzp1_u16(vn, vn);
1349template <
size_t N,
int kPow2>
1354 using TN =
TFromD<
decltype(dn)>;
1356 const svuint32_t clamped =
BitCast(du, detail::MaxN(
v, 0));
1358 const svuint16_t cast16 =
BitCast(d2, detail::SaturateU<TN>(clamped));
1359 const svuint8_t x2 =
BitCast(dn, svuzp1_u16(cast16, cast16));
1360 return svuzp1_u8(x2, x2);
1368 const svuint16_t cast16 =
BitCast(du16,
v);
1369 const svuint16_t x2 = svuzp1_u16(cast16, cast16);
1370 const svuint8_t cast8 =
BitCast(du8, x2);
1371 return svuzp1_u8(cast8, cast8);
1376template <
size_t N,
int kPow2>
1378 const svuint64_t
v) {
1381 const svuint8_t v2 = svuzp1_u8(v1, v1);
1382 const svuint8_t v3 = svuzp1_u8(v2, v2);
1383 return svuzp1_u8(v3, v3);
1386template <
size_t N,
int kPow2>
1388 const svuint64_t
v) {
1391 const svuint16_t v2 = svuzp1_u16(v1, v1);
1392 return svuzp1_u16(v2, v2);
1395template <
size_t N,
int kPow2>
1397 const svuint64_t
v) {
1400 return svuzp1_u32(v1, v1);
1403template <
size_t N,
int kPow2>
1405 const svuint32_t
v) {
1408 const svuint8_t v2 = svuzp1_u8(v1, v1);
1409 return svuzp1_u8(v2, v2);
1412template <
size_t N,
int kPow2>
1414 const svuint32_t
v) {
1417 return svuzp1_u16(v1, v1);
1420template <
size_t N,
int kPow2>
1422 const svuint16_t
v) {
1425 return svuzp1_u8(v1, v1);
1430template <
size_t N,
int kPow2>
1432#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
1433 const svint8_t vn =
BitCast(dn, svqxtnb_s16(
v));
1435 using TN =
TFromD<
decltype(dn)>;
1436 const svint8_t vn =
BitCast(dn, detail::SaturateI<TN>(
v));
1438 return svuzp1_s8(vn, vn);
1441template <
size_t N,
int kPow2>
1443#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
1444 const svint16_t vn =
BitCast(dn, svqxtnb_s32(
v));
1446 using TN =
TFromD<
decltype(dn)>;
1447 const svint16_t vn =
BitCast(dn, detail::SaturateI<TN>(
v));
1449 return svuzp1_s16(vn, vn);
1452template <
size_t N,
int kPow2>
1455#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
1456 const svint16_t cast16 =
BitCast(d2, svqxtnb_s16(svqxtnb_s32(
v)));
1458 using TN =
TFromD<
decltype(dn)>;
1459 const svint16_t cast16 =
BitCast(d2, detail::SaturateI<TN>(
v));
1461 const svint8_t v2 =
BitCast(dn, svuzp1_s16(cast16, cast16));
1462 return BitCast(dn, svuzp1_s8(v2, v2));
1471#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
1472 HWY_INLINE HWY_SVE_V(BASE, BITS) \
1473 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
1474 return sv##OP##_##CHAR##BITS(lo, hi); \
1478#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
1482#undef HWY_SVE_CONCAT_EVERY_SECOND
1486#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
1487 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
1488 HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
1489 return sv##OP##_##CHAR##BITS(mask, lo, hi); \
1492#undef HWY_SVE_SPLICE
1500 return detail::ConcatOddFull(hi, lo);
1502 const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
1503 const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
1504 return detail::Splice(hi_odd, lo_odd,
FirstN(
d,
Lanes(
d) / 2));
1512 return detail::ConcatEvenFull(hi, lo);
1514 const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
1515 const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
1516 return detail::Splice(hi_odd, lo_odd,
FirstN(
d,
Lanes(
d) / 2));
1522template <
size_t N,
int kPow2>
1524 const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(
d),
v);
1525 return detail::ConcatEvenFull(in_even,
1529template <
size_t N,
int kPow2>
1532 return detail::ConcatOddFull(in_even, in_even);
1535template <
size_t N,
int kPow2>
1537 const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(
d),
v);
1538 return detail::ConcatEvenFull(in_even,
1542template <
size_t N,
int kPow2>
1544 const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(
d),
v);
1545 return detail::ConcatEvenFull(in_even,
1551#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
1553 template <size_t N, int kPow2> \
1554 HWY_API HWY_SVE_V(BASE, BITS) \
1555 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(int, BITS) v) { \
1556 return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1559 template <size_t N, int kPow2> \
1560 HWY_API HWY_SVE_V(BASE, BITS) \
1561 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(uint, BITS) v) { \
1562 return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1565 template <size_t N, int kPow2> \
1566 HWY_API HWY_SVE_V(int, BITS) \
1567 NAME(HWY_SVE_D(int, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
1568 return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1573#undef HWY_SVE_CONVERT
1576template <
class VF,
class DI = RebindToSigned<DFromV<VF>>>
1584#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
1585 template <size_t N, int kPow2> \
1586 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
1587 HWY_SVE_T(BASE, BITS) first) { \
1588 return sv##OP##_##CHAR##BITS(first, 1); \
1594template <
class D, HWY_IF_FLOAT_D(D)>
1602template <
class D,
class V>
1604 static_assert(IsSame<TFromD<D>,
TFromV<V>>(),
"D/V mismatch");
1605#if HWY_TARGET == HWY_SVE2_128
1607 return detail::ZipLowerSame(a, b);
1611 const auto a64 =
BitCast(d64, a);
1612 const auto b64 =
BitCast(d64, b);
1613 const auto a_blocks = detail::ConcatEvenFull(a64, a64);
1614 const auto b_blocks = detail::ConcatEvenFull(b64, b64);
1628#if HWY_TARGET == HWY_SVE2_128
1636template <
class D,
class V = VFromD<D>,
1637 hwy::EnableIf<detail::IsFull(D())>* =
nullptr>
1639#if HWY_TARGET == HWY_SVE2_128
1641 return detail::ZipUpperSame(a, b);
1645 const auto a64 =
BitCast(d64, a);
1646 const auto b64 =
BitCast(d64, b);
1647 const auto a_blocks = detail::ConcatOddFull(a64, a64);
1648 const auto b_blocks = detail::ConcatOddFull(b64, b64);
1654template <
class D,
class V = VFromD<D>,
1655 hwy::EnableIf<!detail::IsFull(D())>* =
nullptr>
1658 if (
Lanes(
d) *
sizeof(TFromD<D>) < 16) {
1659 const Half<
decltype(
d)> d2;
1669#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
1670template <
class D, HWY_IF_LANE_SIZE_D(D, 1)>
1674 return svptrue_pat_b8(SV_VL16);
1676 return svptrue_pat_b8(SV_VL8);
1678 return svptrue_pat_b8(SV_VL4);
1680 return svptrue_pat_b8(SV_VL2);
1682 return svptrue_pat_b8(SV_VL1);
1685template <
class D, HWY_IF_LANE_SIZE_D(D, 2)>
1689 return svptrue_pat_b16(SV_VL8);
1691 return svptrue_pat_b16(SV_VL4);
1693 return svptrue_pat_b16(SV_VL2);
1695 return svptrue_pat_b16(SV_VL1);
1698template <
class D, HWY_IF_LANE_SIZE_D(D, 4)>
1702 return svptrue_pat_b32(SV_VL4);
1704 return svptrue_pat_b32(SV_VL2);
1706 return svptrue_pat_b32(SV_VL1);
1709template <
class D, HWY_IF_LANE_SIZE_D(D, 8)>
1713 return svptrue_pat_b64(SV_VL2);
1715 return svptrue_pat_b64(SV_VL1);
1719#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
1720template <
class D, HWY_IF_LANE_SIZE_D(D, 1)>
1724 return svptrue_pat_b8(SV_VL8);
1726 return svptrue_pat_b8(SV_VL4);
1728 return svptrue_pat_b8(SV_VL2);
1732 return svptrue_pat_b8(SV_VL1);
1735template <
class D, HWY_IF_LANE_SIZE_D(D, 2)>
1739 return svptrue_pat_b16(SV_VL4);
1741 return svptrue_pat_b16(SV_VL2);
1745 return svptrue_pat_b16(SV_VL1);
1748template <
class D, HWY_IF_LANE_SIZE_D(D, 4)>
1750 return svptrue_pat_b32(
Lanes(
d) == 4 ? SV_VL2 : SV_VL1);
1752template <
class D, HWY_IF_LANE_SIZE_D(D, 8)>
1754 return svptrue_pat_b64(SV_VL1);
1757#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
1777#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \
1778 template <size_t kIndex> \
1779 HWY_API HWY_SVE_V(BASE, BITS) \
1780 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
1781 return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
1789template <
class D,
class V>
1795template <
class D,
class V>
1798#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
1799 return detail::ConcatEvenBlocks(hi, lo);
1801#if HWY_TARGET == HWY_SVE2_128
1803 const auto lo64 =
BitCast(du64, lo);
1811template <
class D,
class V>
1813#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
1815 return detail::Ext<
Lanes(
d) / 2>(hi, lo);
1822template <
class D,
class V>
1825#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
1826 return detail::ConcatOddBlocks(hi, lo);
1828#if HWY_TARGET == HWY_SVE2_128
1830 const auto lo64 =
BitCast(du64, lo);
1835 const V lo_upper = detail::Splice(lo, lo, mask_upper);
1840template <
class D,
class V2>
1846template <
class D,
class V>
1853template <
class D2,
class V>
1863template <
class DH,
class V>
1865 const Twice<
decltype(dh)>
d;
1869#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
1873 return BitCast(
d, detail::Splice(vu, vu, mask));
1881#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
1882 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
1884 using T = HWY_SVE_T(BASE, BITS); \
1885 using TU = MakeUnsigned<T>; \
1886 constexpr uint64_t kMask = LimitsMax<TU>(); \
1887 return static_cast<T>(static_cast<TU>( \
1888 static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
1891#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
1892 HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
1893 return sv##OP##_##CHAR##BITS(pg, v); \
1905#undef HWY_SVE_REDUCE
1906#undef HWY_SVE_REDUCE_ADD
1909template <
class D,
class V>
1914template <
class D,
class V>
1919template <
class D,
class V>
1930#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
1931 HWY_INLINE HWY_SVE_T(BASE, BITS) \
1932 NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
1933 return sv##OP##_##CHAR##BITS(mask, v); \
1937#undef HWY_SVE_GET_LANE
1955 const auto is_i = detail::EqN(
Iota(
d, 0),
static_cast<TFromV<V>>(i));
1967 return detail::InterleaveEven(
v,
v);
1978 return detail::InterleaveOdd(
v,
v);
1983#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
1985#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
1986 HWY_API HWY_SVE_V(BASE, BITS) \
1987 NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
1988 return sv##OP##_##CHAR##BITS(even, odd, 0); \
1992#undef HWY_SVE_ODD_EVEN
1994template <
class V, HWY_IF_FLOAT_V(V)>
2005 const auto odd_in_even = detail::Ext<1>(odd, odd);
2006 return detail::InterleaveEven(even, odd_in_even);
2015#if HWY_TARGET == HWY_SVE_256
2017#elif HWY_TARGET == HWY_SVE2_128
2023 using TU =
TFromD<
decltype(du)>;
2024 constexpr size_t kShift =
CeilLog2(16 /
sizeof(TU));
2025 const auto idx_block = ShiftRight<kShift>(
Iota(du, 0));
2026 const auto lsb = detail::AndN(idx_block,
static_cast<TU
>(1));
2027 const svbool_t is_even = detail::EqN(lsb,
static_cast<TU
>(0));
2034template <
class D,
class VI>
2037 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index/lane size mismatch");
2039 const auto indices =
BitCast(du, vec);
2040#if HWY_IS_DEBUG_BUILD
2048template <
class D,
typename TI>
2050 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index size must match lane");
2055#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
2056 HWY_API HWY_SVE_V(BASE, BITS) \
2057 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
2058 return sv##OP##_##CHAR##BITS(v, idx); \
2068template <
typename T,
size_t N,
int kPow2>
2079#if HWY_TARGET == HWY_SVE_256
2081#elif HWY_TARGET == HWY_SVE2_128
2086 constexpr auto kLanesPerBlock =
2088 const VFromD<
decltype(du)> idx = detail::XorN(
Iota(du, 0), kLanesPerBlock);
2097#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
2098 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
2099 return sv##OP##_##CHAR##BITS(v); \
2103#undef HWY_SVE_REVERSE
2107template <
class D,
class V>
2110 const auto reversed = detail::ReverseFull(
v);
2117 const svbool_t all_true = detail::AllPTrue(dfull);
2119 const svbool_t mask =
2120 svnot_b_z(all_true,
FirstN(dfull, all_lanes -
Lanes(
d)));
2121 return detail::Splice(reversed, reversed, mask);
2126template <
class D, HWY_IF_LANE_SIZE_D(D, 2)>
2133template <
class D, HWY_IF_LANE_SIZE_D(D, 4)>
2140template <
class D, HWY_IF_LANE_SIZE_D(D, 8)>
2142#if HWY_TARGET == HWY_SVE2_128
2144 return detail::Ext<1>(
v,
v);
2148 const auto odd_in_even = detail::Ext<1>(
v,
v);
2149 return detail::InterleaveEven(odd_in_even,
v);
2156 return detail::ReverseFull(
v);
2160 const auto idx = detail::XorN(
Iota(du, 0), 3);
2168 const auto idx = detail::XorN(
Iota(du, 0), 7);
2174template <
typename T>
2175struct CompressIsPartition {
2176#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2179 enum {
value = (
sizeof(T) == 8) };
2185#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
2186 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
2187 return sv##OP##_##CHAR##BITS(mask, v); \
2190#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
2196#undef HWY_SVE_COMPRESS
2198#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2199template <
class V, HWY_IF_LANE_SIZE_V(V, 8)>
2207 const svuint64_t bits = Shl(
Set(du64, 1),
Iota(du64, 2));
2208 const size_t offset = detail::SumOfLanesM(mask, bits);
2211 alignas(16)
static constexpr uint64_t table[4 * 16] = {
2213 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
2214 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
2215 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
2220#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2221template <
class V, HWY_IF_LANE_SIZE_V(V, 8)>
2228 const svbool_t maskLL = svzip1_b64(mask, mask);
2229 return detail::Splice(
v,
v,
AndNot(maskLL, mask));
2234template <
class V, HWY_IF_LANE_SIZE_V(V, 2)>
2236 static_assert(!IsSame<V, svfloat16_t>(),
"Must use overload");
2237 const DFromV<V> d16;
2243 const svbool_t mask32L = svunpklo_b(mask16);
2244 const svbool_t mask32H = svunpkhi_b(mask16);
2246 const auto compressedL =
Compress(v32L, mask32L);
2247 const auto compressedH =
Compress(v32H, mask32H);
2250 const V evenL =
BitCast(d16, compressedL);
2251 const V evenH =
BitCast(d16, compressedH);
2252 const V v16L = detail::ConcatEvenFull(evenL, evenL);
2253 const V v16H = detail::ConcatEvenFull(evenH, evenH);
2258 const size_t countL = detail::CountTrueFull(dw, mask32L);
2259 const auto compressed_maskL =
FirstN(d16, countL);
2260 return detail::Splice(v16H, v16L, compressed_maskL);
2273template <
class V,
typename T = TFromV<V>, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
2278template <
class V, HWY_IF_LANE_SIZE_V(V, 8)>
2280#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
2286 const svbool_t maskLL = svzip1_b64(mask, mask);
2287 return detail::Splice(
v,
v,
AndNot(mask, maskLL));
2289#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2296 const svuint64_t bits = Shl(
Set(du64, 1),
Iota(du64, 2));
2297 const size_t offset = detail::SumOfLanesM(mask, bits);
2300 alignas(16)
static constexpr uint64_t table[4 * 16] = {
2302 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
2303 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
2304 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
2313#if HWY_TARGET == HWY_SVE2_128
2317#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2319 CopyBytes<4>(&mask, &bits);
2321 const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
2323 alignas(16)
static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
2324 0, 1, 2, 3, 0, 1, 2, 3};
2333template <
class V,
class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2341template <
class V,
class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2345 const svbool_t store_mask =
FirstN(
d, count);
2356#if HWY_TARGET != HWY_SVE2_128
2361template <
class D,
class V>
2364 return detail::AndNotN(
static_cast<T
>(
LanesPerBlock(
d) - 1), iota0);
2367template <
size_t kLanes,
class D, HWY_IF_LANE_SIZE_D(D, 1)>
2371 const svuint8_t idx_mod =
2372 svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2373 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2374 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
2375 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
2376 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
2377 15 % kLanesPerBlock);
2378 return detail::LtN(
BitCast(du, idx_mod), kLanes);
2380template <
size_t kLanes,
class D, HWY_IF_LANE_SIZE_D(D, 2)>
2384 const svuint16_t idx_mod =
2385 svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2386 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
2387 6 % kLanesPerBlock, 7 % kLanesPerBlock);
2388 return detail::LtN(
BitCast(du, idx_mod), kLanes);
2390template <
size_t kLanes,
class D, HWY_IF_LANE_SIZE_D(D, 4)>
2394 const svuint32_t idx_mod =
2395 svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
2396 3 % kLanesPerBlock);
2397 return detail::LtN(
BitCast(du, idx_mod), kLanes);
2399template <
size_t kLanes,
class D, HWY_IF_LANE_SIZE_D(D, 8)>
2403 const svuint64_t idx_mod =
2404 svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
2405 return detail::LtN(
BitCast(du, idx_mod), kLanes);
2411template <
size_t kBytes,
class D,
class V = VFromD<D>>
2414 const auto hi8 =
BitCast(d8, hi);
2415 const auto lo8 =
BitCast(d8, lo);
2416#if HWY_TARGET == HWY_SVE2_128
2417 return BitCast(
d, detail::Ext<kBytes>(hi8, lo8));
2419 const auto hi_up = detail::Splice(hi8, hi8,
FirstN(d8, 16 - kBytes));
2420 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
2430 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
2439 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
2440 const svuint8_t v8 =
BitCast(d8,
v);
2441 return BitCast(
d, CombineShiftRightBytes<12>(d8, v8, v8));
2449 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
2450 const svuint8_t v8 =
BitCast(d8,
v);
2451 return BitCast(
d, CombineShiftRightBytes<4>(d8, v8, v8));
2459 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
2460 const svuint8_t v8 =
BitCast(d8,
v);
2461 return BitCast(
d, CombineShiftRightBytes<8>(d8, v8, v8));
2469 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 8,
"Defined for 64-bit types");
2470 const svuint8_t v8 =
BitCast(d8,
v);
2471 return BitCast(
d, CombineShiftRightBytes<8>(d8, v8, v8));
2481template <
class D,
class V = VFromD<D>>
2483#if HWY_TARGET == HWY_SVE_256
2489#elif HWY_TARGET == HWY_SVE2_128
2499template <
class V,
class VI>
2503#if HWY_TARGET == HWY_SVE2_128
2507 const auto idx8 = Add(
BitCast(du8, idx), offsets128);
2512template <
class V,
class VI>
2518 auto idx8 =
BitCast(di8, idx);
2519 const auto msb = detail::LtN(idx8, 0);
2527#if HWY_TARGET == HWY_SVE2_128
2529#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
2530 template <int kLane> \
2531 HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
2532 return sv##OP##_##CHAR##BITS(v, kLane); \
2536#undef HWY_SVE_BROADCAST
2540template <
int kLane,
class V>
2545 static_assert(0 <= kLane && kLane < kLanesPerBlock,
"Invalid lane");
2546#if HWY_TARGET == HWY_SVE2_128
2547 return detail::BroadcastLane<kLane>(
v);
2551 idx = detail::AddN(idx, kLane);
2559template <
size_t kLanes,
class D,
class V = VFromD<D>>
2561 const auto zero =
Zero(
d);
2562 const auto shifted = detail::Splice(
v, zero,
FirstN(
d, kLanes));
2563#if HWY_TARGET == HWY_SVE2_128
2567 return IfThenElse(detail::FirstNPerBlock<kLanes>(
d), zero, shifted);
2571template <
size_t kLanes,
class V>
2573 return ShiftLeftLanes<kLanes>(
DFromV<V>(),
v);
2577template <
size_t kLanes,
class D,
class V = VFromD<D>>
2584#if HWY_TARGET == HWY_SVE2_128
2585 return detail::Ext<kLanes>(
Zero(
d),
v);
2587 const auto shifted = detail::Ext<kLanes>(
v,
v);
2597template <
int kBytes,
class D,
class V = VFromD<D>>
2603template <
int kBytes,
class V>
2605 return ShiftLeftBytes<kBytes>(
DFromV<V>(),
v);
2609template <
int kBytes,
class D,
class V = VFromD<D>>
2617template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2623template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2629template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2639template <
size_t N,
int kPow2>
2641 const svuint16_t
v) {
2642 return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0),
v));
2647template <
size_t N,
int kPow2>
2649 svfloat32_t a, svfloat32_t b) {
2651 const Repartition<uint32_t,
decltype(dbf16)> du32;
2652 const svuint32_t b_in_even = ShiftRight<16>(
BitCast(du32, b));
2656template <
size_t N,
int kPow2>
2659#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2661 const svint16_t a_in_even = svqxtnb_s32(a);
2662 return svqxtnt_s32(a_in_even, b);
2664 const Half<
decltype(d16)> dh;
2665 const svint16_t a16 =
BitCast(dh, detail::SaturateI<int16_t>(a));
2666 const svint16_t b16 =
BitCast(dh, detail::SaturateI<int16_t>(b));
2667 return detail::InterleaveEven(a16, b16);
2686 static_assert(IsSigned<TFromV<V>>(),
"Only works for signed/float");
2696#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2702 return ShiftRight<1>(detail::AddN(Add(a, b), 1));
2709template <
class D, HWY_IF_LANE_SIZE_D(D, 1)>
2712 const svuint8_t iota =
Iota(du, 0);
2715 const svuint8_t bytes =
BitCast(du, svld1ub_u64(detail::PTrue(
d), bits));
2717 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
2719 const svuint8_t bit =
2720 svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
2724template <
class D, HWY_IF_LANE_SIZE_D(D, 2)>
2727 const RebindToUnsigned<D> du;
2728 const Repartition<uint8_t, D> du8;
2731 const svuint8_t bytes = svld1(
FirstN(du8, (
Lanes(du) + 7) / 8), bits);
2734 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(
Iota(du8, 0)));
2736 const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
2740template <
class D, HWY_IF_LANE_SIZE_D(D, 4)>
2743 const RebindToUnsigned<D> du;
2744 const Repartition<uint8_t, D> du8;
2748 const svuint8_t bytes = svld1(
FirstN(du8, 8), bits);
2751 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(
Iota(du8, 0)));
2754 const svuint32_t bit = Shl(
Set(du, 1), detail::AndN(
Iota(du, 0), 7));
2759template <
class D, HWY_IF_LANE_SIZE_D(D, 8)>
2762 const RebindToUnsigned<D> du;
2767 CopyBytes<4>(bits, &mask_bits);
2768 const auto vbits =
Set(du, mask_bits);
2771 const svuint64_t bit =
Shl(
Set(du, 1),
Iota(du, 0));
2781template <
class T, HWY_IF_LANE_SIZE(T, 1)>
2783 return svdup_n_u8_z(m, 1);
2785template <
class T, HWY_IF_LANE_SIZE(T, 2)>
2788 const svuint8_t b16 =
BitCast(d8, svdup_n_u16_z(m, 1));
2789 return detail::ConcatEvenFull(b16, b16);
2791template <
class T, HWY_IF_LANE_SIZE(T, 4)>
2795template <
class T, HWY_IF_LANE_SIZE(T, 8)>
2797 const ScalableTag<uint32_t> d32;
2798 const svuint32_t b64 =
BitCast(d32, svdup_n_u64_z(m, 1));
2799 return U8FromU32(detail::ConcatEvenFull(b64, b64));
2821 svuint64_t bits_in_u64 =
2824 const size_t num_bits =
Lanes(
d);
2825 const size_t num_bytes = (num_bits + 8 - 1) / 8;
2833 const int mask =
static_cast<int>((1ull << num_bits) - 1);
2834 bits[0] =
static_cast<uint8_t
>(bits[0] & mask);
2842template <
class V,
class D = DFromV<V>, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2848template <
class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
2856#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2858#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
2859 HWY_API HWY_SVE_V(BASE, BITS) \
2860 NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
2861 return sv##OP##_##CHAR##BITS(a, b); \
2865#undef HWY_SVE_MUL_EVEN
2869template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2871#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2872 return BitCast(DW(), detail::MulEvenNative(a, b));
2874 const auto lo = Mul(a, b);
2875 const auto hi =
MulHigh(a, b);
2876 return BitCast(DW(), detail::InterleaveEven(lo, hi));
2881 const auto lo = Mul(a, b);
2882 const auto hi =
MulHigh(a, b);
2883 return detail::InterleaveEven(lo, hi);
2887 const auto lo = Mul(a, b);
2888 const auto hi =
MulHigh(a, b);
2889 return detail::InterleaveOdd(lo, hi);
2894template <
size_t N,
int kPow2>
2896 svuint16_t a, svuint16_t b,
2897 const svfloat32_t sum0,
2898 svfloat32_t& sum1) {
2903 using VU32 =
VFromD<
decltype(du32)>;
2904 const VU32 odd =
Set(du32, 0xFFFF0000u);
2905 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
2907 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
2913template <
size_t N,
int kPow2>
2915 svint16_t a, svint16_t b,
2916 const svint32_t sum0,
2918#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
2920 sum1 = svmlalt_s32(sum1, a, b);
2921 return svmlalb_s32(sum0, a, b);
2923 const svbool_t pg = detail::PTrue(d32);
2926 const svint32_t ae = svexth_s32_x(pg,
BitCast(d32, a));
2927 const svint32_t be = svexth_s32_x(pg,
BitCast(d32, b));
2928 const svint32_t ao = ShiftRight<16>(
BitCast(d32, a));
2929 const svint32_t bo = ShiftRight<16>(
BitCast(d32, b));
2930 sum1 = svmla_s32_x(pg, sum1, ao, bo);
2931 return svmla_s32_x(pg, sum0, ae, be);
2939 return Add(sum0, sum1);
2944#if defined(__ARM_FEATURE_SVE2_AES) || \
2945 ((HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128) && \
2946 HWY_HAVE_RUNTIME_DISPATCH)
2949#ifdef HWY_NATIVE_AES
2950#undef HWY_NATIVE_AES
2952#define HWY_NATIVE_AES
2957 const svuint8_t zero = svdup_n_u8(0);
2958 return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
2962 return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
2966 return svpmullb_pair(a, b);
2970 return svpmullt_pair(a, b);
2978#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
2979 template <size_t N, int kPow2> \
2980 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
2981 return sv##OP##_b##BITS(m, m); \
2988#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
2991 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
2993 const svbool_t eqHx = Eq(a, b);
3008#if HWY_TARGET == HWY_SVE_256
3011 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3013 const svbool_t eqHx = Eq(a, b);
3014 const svbool_t ltHL = Lt(a, b);
3016 const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(
d, ltHL), ltHL);
3018 return detail::DupOddB(
d, ltHx);
3026 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3028 const svbool_t ltHL = Lt(a, b);
3029 return detail::DupOddB(
d, ltHL);
3034#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
3039 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3045 const svuint64_t eqHH =
DupOdd(eqHL);
3046 const svuint64_t eqLL =
DupEven(eqHL);
3047 return And(eqLL, eqHH);
3052 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3058 const svuint64_t neHH =
DupOdd(neHL);
3059 const svuint64_t neLL =
DupEven(neHL);
3060 return Or(neLL, neHH);
3068#if HWY_TARGET == HWY_SVE_256
3071 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3073 const svbool_t eqHL = Eq(a, b);
3074 const svbool_t eqHH = detail::DupOddB(
d, eqHL);
3075 const svbool_t eqLL = detail::DupEvenB(
d, eqHL);
3076 return And(eqLL, eqHH);
3082#if HWY_TARGET == HWY_SVE_256
3085 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3087 const svbool_t neHL = Ne(a, b);
3088 const svbool_t neHH = detail::DupOddB(
d, neHL);
3089 const svbool_t neLL = detail::DupEvenB(
d, neHL);
3090 return Or(neLL, neHH);
3098 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3100 const svbool_t eqHL = Eq(a, b);
3101 return detail::DupOddB(
d, eqHL);
3106 static_assert(!IsSigned<TFromD<D>>() &&
sizeof(
TFromD<D>) == 8,
3108 const svbool_t neHL = Ne(a, b);
3109 return detail::DupOddB(
d, neHL);
3116#if HWY_TARGET == HWY_SVE_256
3125#if HWY_TARGET == HWY_SVE_256
3144#undef HWY_IF_FLOAT_V
3145#undef HWY_IF_LANE_SIZE_V
3146#undef HWY_SVE_ALL_PTRUE
3148#undef HWY_SVE_FOREACH
3149#undef HWY_SVE_FOREACH_F
3150#undef HWY_SVE_FOREACH_F16
3151#undef HWY_SVE_FOREACH_F32
3152#undef HWY_SVE_FOREACH_F64
3153#undef HWY_SVE_FOREACH_I
3154#undef HWY_SVE_FOREACH_I08
3155#undef HWY_SVE_FOREACH_I16
3156#undef HWY_SVE_FOREACH_I32
3157#undef HWY_SVE_FOREACH_I64
3158#undef HWY_SVE_FOREACH_IF
3159#undef HWY_SVE_FOREACH_U
3160#undef HWY_SVE_FOREACH_U08
3161#undef HWY_SVE_FOREACH_U16
3162#undef HWY_SVE_FOREACH_U32
3163#undef HWY_SVE_FOREACH_U64
3164#undef HWY_SVE_FOREACH_UI
3165#undef HWY_SVE_FOREACH_UI08
3166#undef HWY_SVE_FOREACH_UI16
3167#undef HWY_SVE_FOREACH_UI32
3168#undef HWY_SVE_FOREACH_UI64
3169#undef HWY_SVE_FOREACH_UIF3264
3171#undef HWY_SVE_RETV_ARGPV
3172#undef HWY_SVE_RETV_ARGPVN
3173#undef HWY_SVE_RETV_ARGPVV
3174#undef HWY_SVE_RETV_ARGV
3175#undef HWY_SVE_RETV_ARGVN
3176#undef HWY_SVE_RETV_ARGVV
3177#undef HWY_SVE_RETV_ARGVVV
3179#undef HWY_SVE_UNDEFINED
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:103
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:59
#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1121
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:71
#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2978
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:772
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1486
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1073
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1471
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:55
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1584
#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1155
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:365
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:710
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:354
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:126
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:266
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2097
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:994
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:559
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1088
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:310
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:986
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:184
#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1985
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:118
#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2529
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:111
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:155
#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1881
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:63
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:340
#define HWY_SVE_PTRUE(BITS)
Definition arm_sve-inl.h:213
#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1137
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1777
#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1203
#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1188
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:784
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1097
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:89
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:161
#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1003
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2055
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:178
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:633
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:95
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1222
#define HWY_SVE_IS_POW2
Definition arm_sve-inl.h:30
#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1011
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2185
#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:978
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:850
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:515
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:83
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:138
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1551
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:822
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:449
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:122
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:56
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:77
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:846
#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:914
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:280
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:2858
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1174
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:107
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1063
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:601
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1891
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition arm_sve-inl.h:99
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:173
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:151
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:459
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:1930
#define HWY_RESTRICT
Definition base.h:64
#define HWY_API
Definition base.h:129
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_INLINE
Definition base.h:70
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_TARGET
Definition detect_targets.h:380
#define HWY_SVE_256
Definition detect_targets.h:81
HWY_INLINE svuint8_t BoolFromMask(svbool_t m)
Definition arm_sve-inl.h:2782
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag< 1 >)
Definition arm_sve-inl.h:196
HWY_INLINE svuint64_t BitsFromBool(svuint8_t x)
Definition arm_sve-inl.h:2803
svbool_t MaskLowerHalf(D d)
Definition arm_sve-inl.h:1671
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition rvv-inl.h:2078
svbool_t MakeMask(D d)
Definition arm_sve-inl.h:300
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:2069
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:815
HWY_API Vec128< uint16_t, N > Shl(hwy::UnsignedTag, Vec128< uint16_t, N > v, Vec128< uint16_t, N > bits)
Definition x86_128-inl.h:5009
VI SaturateI(VI v)
Definition arm_sve-inl.h:1319
HWY_API svbool_t PFalse()
Definition arm_sve-inl.h:293
svbool_t MaskUpperHalf(D d)
Definition arm_sve-inl.h:1765
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
VU SaturateU(VU v)
Definition arm_sve-inl.h:1313
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:889
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:852
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition arm_sve-inl.h:1299
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:3051
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:670
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:2990
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition ops/shared-inl.h:123
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition ops/shared-inl.h:115
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition rvv-inl.h:2084
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:3038
HWY_INLINE Mask512< T > Not(hwy::SizeTag< 1 >, const Mask512< T > m)
Definition x86_512-inl.h:1613
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
decltype(FirstN(D(), 0)) MFromD
Definition arm_sve-inl.h:276
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition arm_neon-inl.h:6349
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6584
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4453
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2137
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
typename D::Twice Twice
Definition ops/shared-inl.h:231
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4417
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:223
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:173
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:881
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition arm_neon-inl.h:6387
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
N
Definition rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition arm_neon-inl.h:6428
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6517
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4429
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6549
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:885
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API constexpr bool IsSame()
Definition base.h:396
constexpr size_t CeilLog2(TI x)
Definition base.h:899
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
@ value
Definition arm_neon-inl.h:5730
Definition arm_sve-inl.h:40
Definition ops/shared-inl.h:52
uint16_t bits
Definition base.h:297