34template <
typename T,
size_t N = 16 /
sizeof(T)>
37 static constexpr size_t kPrivateN =
N;
44 return *
this = (*
this * other);
47 return *
this = (*
this / other);
50 return *
this = (*
this + other);
53 return *
this = (*
this - other);
56 return *
this = (*
this & other);
59 return *
this = (*
this | other);
62 return *
this = (*
this ^ other);
70 T raw[16 /
sizeof(T)] = {};
74template <
typename T,
size_t N = 16 /
sizeof(T)>
78 return b ?
static_cast<Raw>(
~Raw{0}) : 0;
82 Raw bits[16 /
sizeof(T)] = {};
86using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
89using TFromV =
typename V::PrivateT;
93template <
typename T,
size_t N,
typename FromT,
size_t FromN>
102template <
typename T,
size_t N>
105 ZeroBytes<sizeof(T) * N>(
v.raw);
112template <
typename T,
size_t N,
typename T2>
115 for (
size_t i = 0; i <
N; ++i) {
116 v.
raw[i] =
static_cast<T
>(t);
121template <
typename T,
size_t N>
126template <
typename T,
size_t N,
typename T2>
129 for (
size_t i = 0; i <
N; ++i) {
139template <
typename T,
size_t N>
143 using TU =
TFromD<
decltype(du)>;
145 for (
size_t i = 0; i <
N; ++i) {
146 vu.raw[i] =
static_cast<TU
>(~vu.raw[i]);
152template <
typename T,
size_t N>
158 for (
size_t i = 0; i <
N; ++i) {
159 au.raw[i] &= bu.raw[i];
163template <
typename T,
size_t N>
169template <
typename T,
size_t N>
175template <
typename T,
size_t N>
181 for (
size_t i = 0; i <
N; ++i) {
182 au.raw[i] |= bu.raw[i];
186template <
typename T,
size_t N>
192template <
typename T,
size_t N>
198 for (
size_t i = 0; i <
N; ++i) {
199 au.raw[i] ^= bu.raw[i];
203template <
typename T,
size_t N>
210template <
typename T,
size_t N>
211HWY_API Vec128<T, N>
Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
212 return Xor(x1,
Xor(x2, x3));
217template <
typename T,
size_t N>
218HWY_API Vec128<T, N>
Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
219 return Or(o1,
Or(o2, o3));
223template <
typename T,
size_t N>
224HWY_API Vec128<T, N>
OrAnd(
const Vec128<T, N> o,
const Vec128<T, N> a1,
225 const Vec128<T, N> a2) {
226 return Or(o,
And(a1, a2));
230template <
typename T,
size_t N>
237template <
typename T,
size_t N>
239 const Vec128<T, N> sign) {
240 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
241 const auto msb =
SignBit(Simd<T, N, 0>());
245template <
typename T,
size_t N>
247 const Vec128<T, N> sign) {
248 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
253template <
typename T,
size_t N>
256 for (
size_t i = 0; i <
N; ++i) {
257 v.
raw[i] =
v.raw[i] < 0 ? T(-1) : T(0);
264template <
typename TFrom,
typename TTo,
size_t N>
266 Mask128<TFrom, N> mask) {
273template <
typename T,
size_t N>
280template <
typename T,
size_t N>
287template <
typename T,
size_t N>
292template <
typename T,
size_t N>
295 for (
size_t i = 0; i <
N; ++i) {
302template <
typename T,
size_t N>
308template <
typename T,
size_t N>
310 const Vec128<T, N> yes) {
314template <
typename T,
size_t N>
316 const Vec128<T, N> no) {
320template <
typename T,
size_t N>
323 for (
size_t i = 0; i <
N; ++i) {
324 v.raw[i] =
v.raw[i] < 0 ? yes.raw[i] : no.raw[i];
329template <
typename T,
size_t N>
336template <
typename T,
size_t N>
337HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
341template <
typename T,
size_t N>
342HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
343 const Simd<T, N, 0>
d;
347template <
typename T,
size_t N>
348HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
349 const Simd<T, N, 0>
d;
353template <
typename T,
size_t N>
354HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
355 const Simd<T, N, 0>
d;
359template <
typename T,
size_t N>
360HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
361 const Simd<T, N, 0>
d;
365template <
typename T,
size_t N>
367 const Simd<T, N, 0>
d;
375template <
int kBits,
typename T,
size_t N>
377 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
378 for (
size_t i = 0; i <
N; ++i) {
380 v.
raw[i] =
static_cast<T
>(shifted);
385template <
int kBits,
typename T,
size_t N>
387 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
388#if __cplusplus >= 202002L
391 for (
size_t i = 0; i <
N; ++i) {
392 v.
raw[i] =
static_cast<T
>(
v.raw[i] >> kBits);
399 for (
size_t i = 0; i <
N; ++i) {
400 const TU shifted =
static_cast<TU
>(
static_cast<TU
>(
v.raw[i]) >> kBits);
401 const TU sign =
v.raw[i] < 0 ?
static_cast<TU
>(~TU{0}) : 0;
402 const size_t sign_shift =
403 static_cast<size_t>(
static_cast<int>(
sizeof(TU)) * 8 - 1 - kBits);
404 const TU upper =
static_cast<TU
>(sign << sign_shift);
405 v.raw[i] =
static_cast<T
>(shifted | upper);
408 for (
size_t i = 0; i <
N; ++i) {
409 v.raw[i] =
static_cast<T
>(
v.raw[i] >> kBits);
423 template <
typename T,
size_t N>
425 return Or(ShiftRight<kBits>(
v),
ShiftLeft<
sizeof(T) * 8 - kBits>(
v));
431 template <
typename T,
size_t N>
439template <
int kBits,
typename T,
size_t N>
441 static_assert(0 <= kBits && kBits <
sizeof(T) * 8,
"Invalid shift");
447template <
typename T,
size_t N>
449 for (
size_t i = 0; i <
N; ++i) {
451 v.raw[i] =
static_cast<T
>(shifted);
456template <
typename T,
size_t N>
458#if __cplusplus >= 202002L
461 for (
size_t i = 0; i <
N; ++i) {
462 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits);
469 for (
size_t i = 0; i <
N; ++i) {
470 const TU shifted =
static_cast<TU
>(
static_cast<TU
>(
v.raw[i]) >> bits);
471 const TU sign =
v.raw[i] < 0 ?
static_cast<TU
>(~TU{0}) : 0;
472 const size_t sign_shift =
473 static_cast<size_t>(
static_cast<int>(
sizeof(TU)) * 8 - 1 - bits);
474 const TU upper =
static_cast<TU
>(sign << sign_shift);
475 v.raw[i] =
static_cast<T
>(shifted | upper);
478 for (
size_t i = 0; i <
N; ++i) {
479 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits);
488template <
typename T,
size_t N>
490 for (
size_t i = 0; i <
N; ++i) {
493 v.raw[i] =
static_cast<T
>(shifted);
498template <
typename T,
size_t N>
500#if __cplusplus >= 202002L
503 for (
size_t i = 0; i <
N; ++i) {
504 v.
raw[i] =
static_cast<T
>(
v.raw[i] >> bits.
raw[i]);
511 for (
size_t i = 0; i <
N; ++i) {
513 static_cast<TU
>(
static_cast<TU
>(
v.raw[i]) >> bits.
raw[i]);
514 const TU sign =
v.raw[i] < 0 ?
static_cast<TU
>(~TU{0}) : 0;
515 const size_t sign_shift =
static_cast<size_t>(
516 static_cast<int>(
sizeof(TU)) * 8 - 1 - bits.
raw[i]);
517 const TU upper =
static_cast<TU
>(sign << sign_shift);
518 v.raw[i] =
static_cast<T
>(shifted | upper);
521 for (
size_t i = 0; i <
N; ++i) {
522 v.raw[i] =
static_cast<T
>(
v.raw[i] >> bits.
raw[i]);
534template <
typename T,
size_t N>
537 for (
size_t i = 0; i <
N; ++i) {
538 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw[i]);
539 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw[i]);
540 a.
raw[i] =
static_cast<T
>((a64 + b64) &
static_cast<uint64_t
>(~T(0)));
544template <
typename T,
size_t N>
547 for (
size_t i = 0; i <
N; ++i) {
548 const uint64_t a64 =
static_cast<uint64_t
>(a.
raw[i]);
549 const uint64_t b64 =
static_cast<uint64_t
>(b.
raw[i]);
550 a.
raw[i] =
static_cast<T
>((a64 - b64) &
static_cast<uint64_t
>(~T(0)));
555template <
typename T,
size_t N>
558 for (
size_t i = 0; i <
N; ++i) {
564template <
typename T,
size_t N>
567 for (
size_t i = 0; i <
N; ++i) {
575template <
typename T,
size_t N>
579template <
typename T,
size_t N>
588 Vec128<uint64_t, (
N + 7) / 8> sums;
589 for (
size_t i = 0; i <
N; ++i) {
590 sums.
raw[i / 8] +=
v.raw[i];
596template <
typename T,
size_t N>
598 for (
size_t i = 0; i <
N; ++i) {
599 a.
raw[i] =
static_cast<T
>(
601 hwy::HighestValue<T>()));
607template <
typename T,
size_t N>
609 for (
size_t i = 0; i <
N; ++i) {
610 a.
raw[i] =
static_cast<T
>(
612 hwy::HighestValue<T>()));
618template <
typename T,
size_t N>
620 static_assert(!IsSigned<T>(),
"Only for unsigned");
621 for (
size_t i = 0; i <
N; ++i) {
622 a.
raw[i] =
static_cast<T
>((a.
raw[i] + b.
raw[i] + 1) / 2);
632template <
typename T,
size_t N>
634 for (
size_t i = 0; i <
N; ++i) {
635 const T s = a.
raw[i];
636 const T min = hwy::LimitsMin<T>();
637 a.
raw[i] =
static_cast<T
>((s >= 0 || s == min) ? a.
raw[i] : -s);
642template <
typename T,
size_t N>
644 for (
size_t i = 0; i <
N; ++i) {
645 v.
raw[i] = std::abs(
v.raw[i]);
652template <
typename T,
size_t N>
662template <
typename T,
size_t N>
665 for (
size_t i = 0; i <
N; ++i) {
670template <
typename T,
size_t N>
673 for (
size_t i = 0; i <
N; ++i) {
679template <
typename T,
size_t N>
682 for (
size_t i = 0; i <
N; ++i) {
683 if (std::isnan(a.
raw[i])) {
685 }
else if (std::isnan(b.
raw[i])) {
693template <
typename T,
size_t N>
696 for (
size_t i = 0; i <
N; ++i) {
697 if (std::isnan(a.
raw[i])) {
699 }
else if (std::isnan(b.
raw[i])) {
710template <
typename T,
size_t N>
715template <
typename T,
size_t N>
725template <
typename T,
size_t N>
730template <
typename T,
size_t N>
737template <
typename T,
size_t N>
747template <
typename T,
size_t N>
750 for (
size_t i = 0; i <
N; ++i) {
756template <
typename T,
size_t N>
759 for (
size_t i = 0; i <
N; ++i) {
760 a.
raw[i] =
static_cast<T
>(
static_cast<uint64_t
>(a.
raw[i]) *
761 static_cast<uint64_t
>(b.
raw[i]));
766template <
typename T,
size_t N>
769 for (
size_t i = 0; i <
N; ++i) {
770 a.
raw[i] =
static_cast<T
>(
static_cast<uint64_t
>(a.
raw[i]) *
771 static_cast<uint64_t
>(b.
raw[i]));
778template <
typename T,
size_t N>
783template <
typename T,
size_t N>
785 for (
size_t i = 0; i <
N; ++i) {
795 for (
size_t i = 0; i <
N; ++i) {
796 a.
raw[i] =
static_cast<int16_t
>((int32_t{a.
raw[i]} * b.
raw[i]) >> 16);
803 for (
size_t i = 0; i <
N; ++i) {
807 a.
raw[i] =
static_cast<uint16_t
>(
808 (
static_cast<uint32_t
>(a.
raw[i]) *
static_cast<uint32_t
>(b.
raw[i])) >>
817 for (
size_t i = 0; i <
N; ++i) {
818 a.
raw[i] =
static_cast<int16_t
>((2 * a.
raw[i] * b.
raw[i] + 32768) >> 16);
825HWY_API Vec128<int64_t, (
N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
826 const Vec128<int32_t, N> b) {
827 Vec128<int64_t, (
N + 1) / 2> mul;
828 for (
size_t i = 0; i <
N; i += 2) {
829 const int64_t a64 = a.
raw[i];
830 mul.raw[i / 2] = a64 * b.raw[i];
836 const Vec128<uint32_t, N> b) {
837 Vec128<uint64_t, (
N + 1) / 2> mul;
838 for (
size_t i = 0; i <
N; i += 2) {
839 const uint64_t a64 = a.raw[i];
840 mul.raw[i / 2] = a64 * b.raw[i];
848 Vec128<int64_t, (
N + 1) / 2> mul;
849 for (
size_t i = 0; i <
N; i += 2) {
850 const int64_t a64 = a.
raw[i + 1];
851 mul.raw[i / 2] = a64 * b.
raw[i + 1];
858 Vec128<uint64_t, (
N + 1) / 2> mul;
859 for (
size_t i = 0; i <
N; i += 2) {
860 const uint64_t a64 = a.
raw[i + 1];
861 mul.raw[i / 2] = a64 * b.
raw[i + 1];
868 for (
size_t i = 0; i <
N; ++i) {
872 v.raw[i] = (std::abs(
v.raw[i]) == 0.0f) ? 0.0f : 1.0f /
v.raw[i];
884template <
typename T,
size_t N>
887 return mul * x + add;
890template <
typename T,
size_t N>
893 return add - mul * x;
896template <
typename T,
size_t N>
899 return mul * x - sub;
902template <
typename T,
size_t N>
905 return Neg(mul) * x - sub;
912 for (
size_t i = 0; i <
N; ++i) {
913 const float half =
v.
raw[i] * 0.5f;
917 bits = 0x5F3759DF - (bits >> 1);
920 v.raw[i] =
v.raw[i] * (1.5f - (half *
v.raw[i] *
v.raw[i]));
925template <
typename T,
size_t N>
927 for (
size_t i = 0; i <
N; ++i) {
928 v.
raw[i] = std::sqrt(
v.raw[i]);
935template <
typename T,
size_t N>
939 for (
size_t i = 0; i <
N; ++i) {
940 if (!(a.
raw[i] < MantissaEnd<T>())) {
943 const T bias =
v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
944 const TI rounded =
static_cast<TI
>(
v.raw[i] + bias);
946 v.raw[i] =
v.raw[i] < 0 ? T{-0} : T{0};
949 const T rounded_f =
static_cast<T
>(rounded);
951 if ((rounded & 1) && std::abs(rounded_f -
v.raw[i]) == T(0.5)) {
952 v.raw[i] =
static_cast<T
>(rounded - (
v.raw[i] < T(0) ? -1 : 1));
955 v.raw[i] = rounded_f;
966 const Vec128<float, N> abs =
Abs(
v);
967 Vec128<int32_t, N> ret;
968 for (
size_t i = 0; i <
N; ++i) {
969 const bool signbit = std::signbit(
v.raw[i]);
971 if (!(abs.raw[i] < MantissaEnd<T>())) {
973 if (!(abs.raw[i] <=
static_cast<T
>(LimitsMax<TI>()))) {
974 ret.raw[i] = signbit ? LimitsMin<TI>() :
LimitsMax<TI>();
977 ret.raw[i] =
static_cast<TI
>(
v.raw[i]);
980 const T bias =
v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
981 const TI rounded =
static_cast<TI
>(
v.raw[i] + bias);
986 const T rounded_f =
static_cast<T
>(rounded);
988 if ((rounded & 1) && std::abs(rounded_f -
v.raw[i]) == T(0.5)) {
989 ret.raw[i] = rounded - (signbit ? -1 : 1);
992 ret.raw[i] = rounded;
997template <
typename T,
size_t N>
1001 for (
size_t i = 0; i <
N; ++i) {
1002 if (!(abs.
raw[i] <= MantissaEnd<T>())) {
1005 const TI truncated =
static_cast<TI
>(
v.raw[i]);
1006 if (truncated == 0) {
1007 v.raw[i] =
v.raw[i] < 0 ? -T{0} : T{0};
1010 v.raw[i] =
static_cast<T
>(truncated);
1016template <
typename Float,
size_t N>
1018 constexpr int kMantissaBits = MantissaBits<Float>();
1020 const Bits kExponentMask = MaxExponentField<Float>();
1021 const Bits kMantissaMask = MantissaMask<Float>();
1022 const Bits kBias = kExponentMask / 2;
1024 for (
size_t i = 0; i <
N; ++i) {
1025 const bool positive =
v.raw[i] > Float(0.0);
1030 const int exponent =
1031 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1033 if (exponent >= kMantissaBits)
continue;
1036 v.raw[i] = positive ? Float{1} : Float{-0.0};
1040 const Bits mantissa_mask = kMantissaMask >> exponent;
1042 if ((bits & mantissa_mask) == 0)
continue;
1045 if (positive) bits += (kMantissaMask + 1) >> exponent;
1046 bits &= ~mantissa_mask;
1054template <
typename Float,
size_t N>
1056 constexpr int kMantissaBits = MantissaBits<Float>();
1058 const Bits kExponentMask = MaxExponentField<Float>();
1059 const Bits kMantissaMask = MantissaMask<Float>();
1060 const Bits kBias = kExponentMask / 2;
1062 for (
size_t i = 0; i <
N; ++i) {
1063 const bool negative =
v.raw[i] < Float(0.0);
1068 const int exponent =
1069 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1071 if (exponent >= kMantissaBits)
continue;
1074 v.raw[i] = negative ? Float(-1.0) : Float(0.0);
1078 const Bits mantissa_mask = kMantissaMask >> exponent;
1080 if ((bits & mantissa_mask) == 0)
continue;
1083 if (negative) bits += (kMantissaMask + 1) >> exponent;
1084 bits &= ~mantissa_mask;
1093template <
typename T,
size_t N>
1096 for (
size_t i = 0; i <
N; ++i) {
1108template <
typename T,
size_t N>
1110 static_assert(IsFloat<T>(),
"Only for float");
1115 return RebindMask(
d, Eq(Add(vi, vi),
Set(di, hwy::MaxExponentTimes2<T>())));
1119template <
typename T,
size_t N>
1121 static_assert(IsFloat<T>(),
"Only for float");
1125 using VI =
VFromD<
decltype(di)>;
1126 using VU =
VFromD<
decltype(du)>;
1133 return RebindMask(
d, Lt(exp,
Set(di, hwy::MaxExponentField<T>())));
1138template <
typename T,
size_t N>
1141 for (
size_t i = 0; i <
N; ++i) {
1147template <
typename T,
size_t N>
1150 for (
size_t i = 0; i <
N; ++i) {
1156template <
typename T,
size_t N>
1158 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1159 return (
v & bit) == bit;
1162template <
typename T,
size_t N>
1165 for (
size_t i = 0; i <
N; ++i) {
1170template <
typename T,
size_t N>
1171HWY_API Mask128<T, N>
operator>(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1173 for (
size_t i = 0; i <
N; ++i) {
1179template <
typename T,
size_t N>
1182 for (
size_t i = 0; i <
N; ++i) {
1187template <
typename T,
size_t N>
1190 for (
size_t i = 0; i <
N; ++i) {
1211 const bool lt = a.
raw[1] < b.
raw[1];
1222 const bool eq = a.
raw[1] == b.
raw[1] && a.
raw[0] == b.
raw[0];
1230 const bool ne = a.
raw[1] != b.
raw[1] || a.
raw[0] != b.
raw[0];
1239 const bool eq = a.
raw[1] == b.
raw[1];
1248 const bool ne = a.
raw[1] != b.
raw[1];
1256template <
class D,
class V = VFromD<D>>
1261template <
class D,
class V = VFromD<D>>
1266template <
class D,
class V = VFromD<D>>
1271template <
class D,
class V = VFromD<D>>
1280template <
typename T,
size_t N>
1284 CopyBytes<sizeof(T) * N>(aligned,
v.raw);
1288template <
typename T,
size_t N>
1294template <
typename T,
size_t N>
1300template <
typename T,
size_t N>
1303 return Load(
d, aligned);
1308template <
typename T,
size_t N>
1311 CopyBytes<sizeof(T) * N>(
v.raw, aligned);
1314template <
typename T,
size_t N>
1319template <
typename T,
size_t N>
1322 for (
size_t i = 0; i <
N; ++i) {
1323 if (m.bits[i]) p[i] =
v.raw[i];
1332#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1333#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1335#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1338template <
typename T,
size_t N>
1341 alignas(16) T buf0[
N];
1342 alignas(16) T buf1[
N];
1343 for (
size_t i = 0; i <
N; ++i) {
1344 buf0[i] = *unaligned++;
1345 buf1[i] = *unaligned++;
1351template <
typename T,
size_t N>
1355 alignas(16) T buf0[
N];
1356 alignas(16) T buf1[
N];
1357 alignas(16) T buf2[
N];
1358 for (
size_t i = 0; i <
N; ++i) {
1359 buf0[i] = *unaligned++;
1360 buf1[i] = *unaligned++;
1361 buf2[i] = *unaligned++;
1368template <
typename T,
size_t N>
1372 alignas(16) T buf0[
N];
1373 alignas(16) T buf1[
N];
1374 alignas(16) T buf2[
N];
1375 alignas(16) T buf3[
N];
1376 for (
size_t i = 0; i <
N; ++i) {
1377 buf0[i] = *unaligned++;
1378 buf1[i] = *unaligned++;
1379 buf2[i] = *unaligned++;
1380 buf3[i] = *unaligned++;
1390template <
typename T,
size_t N>
1394 for (
size_t i = 0; i <
N; ++i) {
1395 *unaligned++ = v0.
raw[i];
1396 *unaligned++ = v1.
raw[i];
1400template <
typename T,
size_t N>
1404 for (
size_t i = 0; i <
N; ++i) {
1405 *unaligned++ = v0.
raw[i];
1406 *unaligned++ = v1.
raw[i];
1407 *unaligned++ = v2.
raw[i];
1411template <
typename T,
size_t N>
1416 for (
size_t i = 0; i <
N; ++i) {
1417 *unaligned++ = v0.
raw[i];
1418 *unaligned++ = v1.
raw[i];
1419 *unaligned++ = v2.
raw[i];
1420 *unaligned++ = v3.
raw[i];
1426template <
typename T,
size_t N>
1434template <
typename T,
size_t N,
typename Offset>
1437 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1438 for (
size_t i = 0; i <
N; ++i) {
1439 uint8_t*
const base8 =
reinterpret_cast<uint8_t*
>(base) + offset.
raw[i];
1440 CopyBytes<sizeof(T)>(&
v.raw[i], base8);
1444template <
typename T,
size_t N,
typename Index>
1447 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1448 for (
size_t i = 0; i <
N; ++i) {
1449 base[index.
raw[i]] =
v.raw[i];
1455template <
typename T,
size_t N,
typename Offset>
1458 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1460 for (
size_t i = 0; i <
N; ++i) {
1461 const uint8_t* base8 =
1462 reinterpret_cast<const uint8_t*
>(base) + offset.
raw[i];
1463 CopyBytes<sizeof(T)>(base8, &
v.raw[i]);
1468template <
typename T,
size_t N,
typename Index>
1471 const Vec128<Index, N> index) {
1472 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1474 for (
size_t i = 0; i <
N; ++i) {
1475 v.raw[i] = base[index.raw[i]];
1485template <
typename FromT,
typename ToT,
size_t N>
1488 static_assert(
sizeof(ToT) >
sizeof(FromT),
"Not promoting");
1490 for (
size_t i = 0; i <
N; ++i) {
1492 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1503 for (
size_t i = 0; i <
N; ++i) {
1505 if (std::isinf(from.
raw[i]) ||
1511 ret.
raw[i] =
static_cast<float>(from.
raw[i]);
1519 for (
size_t i = 0; i <
N; ++i) {
1521 if (std::isinf(from.
raw[i]) ||
1522 std::fabs(from.
raw[i]) >
static_cast<double>(HighestValue<int32_t>())) {
1523 ret.
raw[i] = std::signbit(from.
raw[i]) ? LowestValue<int32_t>()
1524 : HighestValue<int32_t>();
1527 ret.
raw[i] =
static_cast<int32_t
>(from.
raw[i]);
1532template <
typename FromT,
typename ToT,
size_t N>
1535 static_assert(!IsFloat<FromT>(),
"FromT=double are handled above");
1536 static_assert(
sizeof(ToT) <
sizeof(FromT),
"Not demoting");
1539 for (
size_t i = 0; i <
N; ++i) {
1543 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1550 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
1551 const Repartition<uint32_t,
decltype(dbf16)> du32;
1552 const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(
BitCast(du32, b));
1554 const Vec128<uint32_t, N> a_mask =
Set(du32, 0xFFFF0000);
1562 const int16_t min = LimitsMin<int16_t>();
1563 const int16_t max = LimitsMax<int16_t>();
1565 for (
size_t i = 0; i <
N; ++i) {
1568 for (
size_t i = 0; i <
N; ++i) {
1591 const Vec128<float16_t, N>
v) {
1592 Vec128<float, N> ret;
1593 for (
size_t i = 0; i <
N; ++i) {
1595 const uint32_t sign =
static_cast<uint32_t
>(bits16 >> 15);
1596 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1597 const uint32_t mantissa = bits16 & 0x3FF;
1600 if (biased_exp == 0) {
1601 const float subnormal =
1602 (1.0f / 16384) * (
static_cast<float>(mantissa) * (1.0f / 1024));
1603 ret.raw[i] = sign ? -subnormal : subnormal;
1609 const uint32_t biased_exp32 = biased_exp + (127 - 15);
1610 const uint32_t mantissa32 = mantissa << (23 - 10);
1611 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1619 const Vec128<bfloat16_t, N>
v) {
1620 Vec128<float, N> ret;
1621 for (
size_t i = 0; i <
N; ++i) {
1629 const Vec128<float, N>
v) {
1630 Vec128<float16_t, N> ret;
1631 for (
size_t i = 0; i <
N; ++i) {
1634 const uint32_t sign = bits32 >> 31;
1635 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1636 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1638 const int32_t exp =
HWY_MIN(
static_cast<int32_t
>(biased_exp32) - 127, 15);
1642 ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]);
1646 uint32_t biased_exp16, mantissa16;
1651 const uint32_t sub_exp =
static_cast<uint32_t
>(-14 - exp);
1653 mantissa16 =
static_cast<uint32_t
>((1u << (10 - sub_exp)) +
1654 (mantissa32 >> (13 + sub_exp)));
1657 biased_exp16 =
static_cast<uint32_t
>(exp + 15);
1658 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1659 mantissa16 = mantissa32 >> 13;
1663 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1665 const uint16_t narrowed =
static_cast<uint16_t
>(bits16);
1673 const Vec128<float, N>
v) {
1674 Vec128<bfloat16_t, N> ret;
1675 for (
size_t i = 0; i <
N; ++i) {
1684template <
typename FromT,
typename ToT,
size_t N>
1688 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1690 for (
size_t i = 0; i <
N; ++i) {
1693 const double f =
static_cast<double>(from.
raw[i]);
1694 if (std::isinf(from.
raw[i]) ||
1695 std::fabs(f) >
static_cast<double>(LimitsMax<ToT>())) {
1697 std::signbit(from.
raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>();
1700 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1705template <
typename FromT,
typename ToT,
size_t N>
1709 static_assert(
sizeof(ToT) ==
sizeof(FromT),
"Should have same size");
1711 for (
size_t i = 0; i <
N; ++i) {
1713 ret.
raw[i] =
static_cast<ToT
>(from.
raw[i]);
1720template <
typename FromT,
typename ToT,
size_t N>
1736 for (
size_t i = 0; i <
N; ++i) {
1737 ret.
raw[i] =
static_cast<uint8_t
>(
v.raw[i] & 0xFF);
1746 for (
size_t i = 0; i <
N; ++i) {
1747 ret.
raw[i] =
static_cast<uint16_t
>(
v.raw[i] & 0xFFFF);
1756 for (
size_t i = 0; i <
N; ++i) {
1757 ret.
raw[i] =
static_cast<uint32_t
>(
v.raw[i] & 0xFFFFFFFFu);
1766 for (
size_t i = 0; i <
N; ++i) {
1767 ret.
raw[i] =
static_cast<uint8_t
>(
v.raw[i] & 0xFF);
1776 for (
size_t i = 0; i <
N; ++i) {
1777 ret.
raw[i] =
static_cast<uint16_t
>(
v.raw[i] & 0xFFFF);
1786 for (
size_t i = 0; i <
N; ++i) {
1787 ret.
raw[i] =
static_cast<uint8_t
>(
v.raw[i] & 0xFF);
1794template <
typename T,
size_t N>
1801template <
typename T,
size_t N>
1807template <
typename T,
size_t N>
1815template <
typename T,
size_t N>
1817 Vec128<T, N / 2>
v) {
1823template <
typename T,
size_t N>
1832template <
typename T,
size_t N>
1841template <
typename T,
size_t N>
1850template <
typename T,
size_t N>
1860template <
typename T,
size_t N>
1864 CopyBytes<
N / 2 *
sizeof(T)>(lo.raw, &ret.raw[0]);
1865 CopyBytes<
N / 2 *
sizeof(T)>(&hi.raw[
N / 2], &ret.raw[
N / 2]);
1869template <
typename T,
size_t N>
1873 for (
size_t i = 0; i <
N / 2; ++i) {
1874 ret.
raw[i] = lo.
raw[2 * i];
1876 for (
size_t i = 0; i <
N / 2; ++i) {
1877 ret.
raw[
N / 2 + i] = hi.
raw[2 * i];
1882template <
typename T,
size_t N>
1886 for (
size_t i = 0; i <
N / 2; ++i) {
1887 ret.
raw[i] = lo.
raw[2 * i + 1];
1889 for (
size_t i = 0; i <
N / 2; ++i) {
1890 ret.
raw[
N / 2 + i] = hi.
raw[2 * i + 1];
1897template <
int kBytes,
typename T,
size_t N,
class V = Vec128<T, N>>
1901 reinterpret_cast<const uint8_t *
HWY_RESTRICT>(lo.raw);
1904 CopyBytes<
sizeof(T) *
N - kBytes>(lo8 + kBytes, ret8);
1905 CopyBytes<kBytes>(hi.raw, ret8 +
sizeof(T) *
N - kBytes);
1911template <
int kBytes,
typename T,
size_t N>
1913 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1917 ZeroBytes<kBytes>(ret8);
1918 CopyBytes<
sizeof(T) *
N - kBytes>(
v.raw, ret8 + kBytes);
1922template <
int kBytes,
typename T,
size_t N>
1924 return ShiftLeftBytes<kBytes>(
DFromV<
decltype(
v)>(),
v);
1929template <
int kLanes,
typename T,
size_t N>
1935template <
int kLanes,
typename T,
size_t N>
1937 return ShiftLeftLanes<kLanes>(
DFromV<
decltype(
v)>(),
v);
1941template <
int kBytes,
typename T,
size_t N>
1943 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1949 CopyBytes<
sizeof(T) *
N - kBytes>(v8 + kBytes, ret8);
1950 ZeroBytes<kBytes>(ret8 +
sizeof(T) *
N - kBytes);
1955template <
int kLanes,
typename T,
size_t N>
1963template <
typename T,
size_t N>
1968template <
typename T,
size_t N>
1974template <
typename T,
size_t N>
1979template <
typename T,
size_t N>
1981 for (
size_t i = 0; i <
N; i += 2) {
1982 v.
raw[i + 1] =
v.raw[i];
1987template <
typename T,
size_t N>
1989 for (
size_t i = 0; i <
N; i += 2) {
1990 v.
raw[i] =
v.raw[i + 1];
1995template <
typename T,
size_t N>
1996HWY_API Vec128<T, N>
OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
1997 for (
size_t i = 0; i <
N; i += 2) {
1998 odd.raw[i] = even.raw[i];
2003template <
typename T,
size_t N>
2010template <
typename T,
size_t N>
2018template <
typename T,
size_t N>
2023template <
typename T,
size_t N,
typename TI>
2025 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane size");
2027 CopyBytes<N * sizeof(T)>(vec.
raw, ret.raw);
2031template <
typename T,
size_t N,
typename TI>
2036template <
typename T,
size_t N>
2038 const Indices128<T, N> idx) {
2040 for (
size_t i = 0; i <
N; ++i) {
2041 ret.raw[i] =
v.raw[idx.raw[i]];
2049template <
typename T,
size_t N>
2057template <
typename T,
size_t N>
2060 for (
size_t i = 0; i <
N; ++i) {
2061 ret.
raw[i] =
v.raw[
N - 1 - i];
2066template <
typename T,
size_t N>
2069 for (
size_t i = 0; i <
N; i += 2) {
2070 ret.
raw[i + 0] =
v.raw[i + 1];
2071 ret.
raw[i + 1] =
v.raw[i + 0];
2076template <
typename T,
size_t N>
2079 for (
size_t i = 0; i <
N; i += 4) {
2080 ret.
raw[i + 0] =
v.raw[i + 3];
2081 ret.
raw[i + 1] =
v.raw[i + 2];
2082 ret.
raw[i + 2] =
v.raw[i + 1];
2083 ret.
raw[i + 3] =
v.raw[i + 0];
2088template <
typename T,
size_t N>
2091 for (
size_t i = 0; i <
N; i += 8) {
2092 ret.
raw[i + 0] =
v.raw[i + 7];
2093 ret.
raw[i + 1] =
v.raw[i + 6];
2094 ret.
raw[i + 2] =
v.raw[i + 5];
2095 ret.
raw[i + 3] =
v.raw[i + 4];
2096 ret.
raw[i + 4] =
v.raw[i + 3];
2097 ret.
raw[i + 5] =
v.raw[i + 2];
2098 ret.
raw[i + 6] =
v.raw[i + 1];
2099 ret.
raw[i + 7] =
v.raw[i + 0];
2109template <
typename T,
size_t N>
2111 static_assert(
sizeof(T) == 4,
"Only for 32-bit");
2112 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2117template <
typename T>
2119 static_assert(
sizeof(T) == 4,
"Only for 32-bit");
2121 ret.raw[3] =
v.raw[1];
2122 ret.raw[2] =
v.raw[0];
2123 ret.raw[1] =
v.raw[3];
2124 ret.raw[0] =
v.raw[2];
2127template <
typename T>
2129 static_assert(
sizeof(T) == 8,
"Only for 64-bit");
2134template <
typename T>
2137 ret.raw[3] =
v.raw[0];
2138 ret.raw[2] =
v.raw[3];
2139 ret.raw[1] =
v.raw[2];
2140 ret.raw[0] =
v.raw[1];
2145template <
typename T>
2148 ret.raw[3] =
v.raw[2];
2149 ret.raw[2] =
v.raw[1];
2150 ret.raw[1] =
v.raw[0];
2151 ret.raw[0] =
v.raw[3];
2155template <
typename T>
2162template <
int kLane,
typename T,
size_t N>
2164 for (
size_t i = 0; i <
N; ++i) {
2165 v.
raw[i] =
v.raw[kLane];
2172template <
typename T,
size_t N,
typename TI,
size_t NI>
2178 reinterpret_cast<const uint8_t*
>(indices.
raw);
2182 for (
size_t i = 0; i < NI *
sizeof(TI); ++i) {
2183 const size_t idx = idx_bytes[i];
2185 ret_bytes[i] = idx <
sizeof(T) *
N ? v_bytes[idx] : 0;
2190template <
typename T,
size_t N,
typename TI,
size_t NI>
2199template <
typename T,
size_t N>
2203 for (
size_t i = 0; i <
N / 2; ++i) {
2204 ret.
raw[2 * i + 0] = a.
raw[i];
2205 ret.
raw[2 * i + 1] = b.
raw[i];
2216template <
typename T,
size_t N>
2221 for (
size_t i = 0; i <
N / 2; ++i) {
2222 ret.
raw[2 * i + 0] = a.
raw[
N / 2 + i];
2223 ret.
raw[2 * i + 1] = b.
raw[
N / 2 + i];
2232template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2236template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2241template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2248template <
typename T,
size_t N>
2251 for (
size_t i = 0; i <
N; ++i) {
2252 or_sum |= mask.bits[i];
2257template <
typename T,
size_t N>
2259 constexpr uint64_t kAll = LimitsMax<typename Mask128<T, N>::Raw>();
2260 uint64_t and_sum = kAll;
2261 for (
size_t i = 0; i <
N; ++i) {
2262 and_sum &= mask.
bits[i];
2264 return and_sum == kAll;
2268template <
typename T,
size_t N>
2272 for (
size_t i = 0; i <
N; ++i) {
2273 const size_t bit =
size_t{1} << (i & 7);
2274 const size_t idx_byte = i >> 3;
2281template <
typename T,
size_t N>
2285 if (
N > 8) bits[1] = 0;
2286 for (
size_t i = 0; i <
N; ++i) {
2287 const size_t bit =
size_t{1} << (i & 7);
2288 const size_t idx_byte = i >> 3;
2290 bits[idx_byte] =
static_cast<uint8_t
>(bits[idx_byte] | bit);
2293 return N > 8 ? 2 : 1;
2296template <
typename T,
size_t N>
2299 for (
size_t i = 0; i <
N; ++i) {
2300 count += mask.
bits[i] != 0;
2305template <
typename T,
size_t N>
2307 const Mask128<T, N> mask) {
2308 for (
size_t i = 0; i <
N; ++i) {
2309 if (mask.bits[i] != 0)
return i;
2315template <
typename T,
size_t N>
2317 const Mask128<T, N> mask) {
2318 for (
size_t i = 0; i <
N; ++i) {
2319 if (mask.bits[i] != 0)
return static_cast<intptr_t
>(i);
2321 return intptr_t{-1};
2326template <
typename T>
2327struct CompressIsPartition {
2328 enum {
value = (
sizeof(T) != 1) };
2331template <
typename T,
size_t N>
2335 for (
size_t i = 0; i <
N; ++i) {
2337 ret.
raw[count++] =
v.raw[i];
2340 for (
size_t i = 0; i <
N; ++i) {
2341 if (!mask.
bits[i]) {
2342 ret.
raw[count++] =
v.raw[i];
2350template <
typename T,
size_t N>
2354 for (
size_t i = 0; i <
N; ++i) {
2355 if (!mask.
bits[i]) {
2356 ret.
raw[count++] =
v.raw[i];
2359 for (
size_t i = 0; i <
N; ++i) {
2361 ret.
raw[count++] =
v.raw[i];
2370 Mask128<uint64_t> ) {
2375template <
typename T,
size_t N>
2382template <
typename T,
size_t N>
2387 for (
size_t i = 0; i <
N; ++i) {
2389 unaligned[count++] =
v.raw[i];
2396template <
typename T,
size_t N>
2404template <
typename T,
size_t N>
2417 Vec128<bfloat16_t, 2 * N> a,
2418 Vec128<bfloat16_t, 2 * N> b,
2419 const Vec128<float, N> sum0,
2420 Vec128<float, N>& sum1) {
2421 const Rebind<uint32_t,
decltype(df32)> du32;
2422 using VU32 =
VFromD<
decltype(du32)>;
2423 const VU32 odd =
Set(du32, 0xFFFF0000u);
2425 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
2427 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
2437 using VI32 =
VFromD<
decltype(d32)>;
2439 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, a)));
2440 const VI32 be = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, b)));
2441 const VI32 ao = ShiftRight<16>(
BitCast(d32, a));
2442 const VI32 bo = ShiftRight<16>(
BitCast(d32, b));
2443 sum1 = Add(Mul(ao, bo), sum1);
2444 return Add(Mul(ae, be), sum0);
2450 return Add(sum0, sum1);
2455template <
typename T,
size_t N>
2458 for (
size_t i = 0; i <
N; ++i) {
2463template <
typename T,
size_t N>
2465 T min = HighestValue<T>();
2466 for (
size_t i = 0; i <
N; ++i) {
2471template <
typename T,
size_t N>
2473 T max = LowestValue<T>();
2474 for (
size_t i = 0; i <
N; ++i) {
2485 const Vec128<uint64_t> b) {
2486 alignas(16) uint64_t mul[2];
2488 return Load(Full128<uint64_t>(), mul);
2492 const Vec128<uint64_t> b) {
2493 alignas(16) uint64_t mul[2];
2494 const Half<Full128<uint64_t>> d2;
2497 return Load(Full128<uint64_t>(), mul);
#define HWY_MAX(a, b)
Definition base.h:135
#define HWY_RESTRICT
Definition base.h:64
#define HWY_API
Definition base.h:129
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_INLINE
Definition base.h:70
#define HWY_DASSERT(condition)
Definition base.h:238
Definition arm_neon-inl.h:825
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:827
static HWY_INLINE Raw FromBool(bool b)
Definition emu128-inl.h:77
Raw bits[16/sizeof(T)]
Definition emu128-inl.h:82
Definition arm_neon-inl.h:778
HWY_INLINE Vec128()=default
T PrivateT
Definition arm_neon-inl.h:782
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition emu128-inl.h:46
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition emu128-inl.h:52
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition emu128-inl.h:61
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition emu128-inl.h:58
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition emu128-inl.h:43
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition emu128-inl.h:55
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition emu128-inl.h:49
HWY_API Vec128< T, N > Neg(hwy::NonFloatTag, Vec128< T, N > v)
Definition emu128-inl.h:726
HWY_INLINE Vec128< T, N > Abs(SignedTag, Vec128< T, N > a)
Definition emu128-inl.h:633
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:535
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:663
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:545
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:671
HWY_API Vec128< ToT, N > ConvertTo(hwy::FloatTag, Simd< ToT, N, 0 >, Vec128< FromT, N > from)
Definition emu128-inl.h:1685
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:926
HWY_INLINE void StoreU16ToF16(const uint16_t val, hwy::float16_t *HWY_RESTRICT to)
Definition emu128-inl.h:1576
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t *HWY_RESTRICT from)
Definition emu128-inl.h:1581
HWY_INLINE Vec128< T, N > Mul(hwy::FloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:748
static bool SignBit(float f)
Definition scalar-inl.h:601
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition arm_neon-inl.h:6349
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6584
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition arm_neon-inl.h:6387
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
N
Definition rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition arm_neon-inl.h:6428
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6517
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6549
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API float F32FromBF16(bfloat16_t bf)
Definition base.h:975
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
HWY_API bfloat16_t BF16FromF32(float f)
Definition base.h:983
constexpr float HighestValue< float >()
Definition base.h:688
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:961
constexpr float LowestValue< float >()
Definition base.h:675
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
HWY_API constexpr T LimitsMax()
Definition base.h:656
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
@ value
Definition arm_neon-inl.h:5730
Definition arm_neon-inl.h:3968
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:3969
Definition ops/shared-inl.h:52
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v) const
Definition emu128-inl.h:432
Definition emu128-inl.h:422
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v) const
Definition emu128-inl.h:424