enhance: Improve bitset performance for AVX512 (#35479)

see #35478

optimized functions take 20%+ less time to run

Signed-off-by: Alexandr Guzhva <alexanderguzhva@gmail.com>
pull/35508/head
Alexander Guzhva 2024-08-15 19:44:53 -04:00 committed by GitHub
parent 1bbf7a3c0e
commit b896143965
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 438 additions and 18 deletions

View File

@ -48,6 +48,11 @@ get_mask(const size_t count) {
///////////////////////////////////////////////////////////////////////////
constexpr size_t N_BLOCKS = 8;
constexpr size_t PAGE_SIZE = 4096;
constexpr size_t BLOCKS_PREFETCH_AHEAD = 4;
constexpr size_t CACHELINE_WIDTH = 0x40;
//
template <CompareOpType Op>
bool
@ -65,9 +70,30 @@ OpCompareValImpl<int8_t, Op>::op_compare_val(uint8_t* const __restrict res_u8,
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int8_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 64) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i v =
_mm512_loadu_si512(src + i + p + ip * BLOCK_COUNT);
const __mmask64 cmp_mask =
_mm512_cmp_epi8_mask(v, target, pred);
res_u64[(i + p + ip * BLOCK_COUNT) / 64] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size64 = (size / 64) * 64;
for (size_t i = 0; i < size64; i += 64) {
for (size_t i = size_8p; i < size64; i += 64) {
const __m512i v = _mm512_loadu_si512(src + i);
const __mmask64 cmp_mask = _mm512_cmp_epi8_mask(v, target, pred);
@ -107,9 +133,30 @@ OpCompareValImpl<int16_t, Op>::op_compare_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int16_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 32) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i v =
_mm512_loadu_si512(src + i + p + ip * BLOCK_COUNT);
const __mmask32 cmp_mask =
_mm512_cmp_epi16_mask(v, target, pred);
res_u32[(i + p + ip * BLOCK_COUNT) / 32] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size32 = (size / 32) * 32;
for (size_t i = 0; i < size32; i += 32) {
for (size_t i = size_8p; i < size32; i += 32) {
const __m512i v = _mm512_loadu_si512(src + i);
const __mmask32 cmp_mask = _mm512_cmp_epi16_mask(v, target, pred);
@ -149,9 +196,30 @@ OpCompareValImpl<int32_t, Op>::op_compare_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int32_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i v =
_mm512_loadu_si512(src + i + p + ip * BLOCK_COUNT);
const __mmask16 cmp_mask =
_mm512_cmp_epi32_mask(v, target, pred);
res_u16[(i + p + ip * BLOCK_COUNT) / 16] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m512i v = _mm512_loadu_si512(src + i);
const __mmask16 cmp_mask = _mm512_cmp_epi32_mask(v, target, pred);
@ -187,9 +255,30 @@ OpCompareValImpl<int64_t, Op>::op_compare_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int64_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 8) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i v =
_mm512_loadu_si512(src + i + p + ip * BLOCK_COUNT);
const __mmask8 cmp_mask =
_mm512_cmp_epi64_mask(v, target, pred);
res_u8[(i + p + ip * BLOCK_COUNT) / 8] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size8 = (size / 8) * 8;
for (size_t i = 0; i < size8; i += 8) {
for (size_t i = size_8p; i < size8; i += 8) {
const __m512i v = _mm512_loadu_si512(src + i);
const __mmask8 cmp_mask = _mm512_cmp_epi64_mask(v, target, pred);
@ -216,9 +305,29 @@ OpCompareValImpl<float, Op>::op_compare_val(uint8_t* const __restrict res_u8,
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(float);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512 v =
_mm512_loadu_ps(src + i + p + ip * BLOCK_COUNT);
const __mmask16 cmp_mask = _mm512_cmp_ps_mask(v, target, pred);
res_u16[(i + p + ip * BLOCK_COUNT) / 16] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m512 v = _mm512_loadu_ps(src + i);
const __mmask16 cmp_mask = _mm512_cmp_ps_mask(v, target, pred);
@ -254,9 +363,29 @@ OpCompareValImpl<double, Op>::op_compare_val(uint8_t* const __restrict res_u8,
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(double);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 8) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512d v =
_mm512_loadu_pd(src + i + p + ip * BLOCK_COUNT);
const __mmask8 cmp_mask = _mm512_cmp_pd_mask(v, target, pred);
res_u8[(i + p + ip * BLOCK_COUNT) / 8] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size8 = (size / 8) * 8;
for (size_t i = 0; i < size8; i += 8) {
for (size_t i = size_8p; i < size8; i += 8) {
const __m512d v = _mm512_loadu_pd(src + i);
const __mmask8 cmp_mask = _mm512_cmp_pd_mask(v, target, pred);
@ -792,9 +921,32 @@ OpWithinRangeValImpl<int8_t, Op>::op_within_range_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int8_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 64) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i vv =
_mm512_loadu_si512(values + i + p + ip * BLOCK_COUNT);
const __mmask64 cmpl_mask =
_mm512_cmp_epi8_mask(lower_v, vv, pred_lower);
const __mmask64 cmp_mask = _mm512_mask_cmp_epi8_mask(
cmpl_mask, vv, upper_v, pred_upper);
res_u64[(i + p + ip * BLOCK_COUNT) / 64] = cmp_mask;
_mm_prefetch((const char*)(values + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size64 = (size / 64) * 64;
for (size_t i = 0; i < size64; i += 64) {
for (size_t i = size_8p; i < size64; i += 64) {
const __m512i vv = _mm512_loadu_si512(values + i);
const __mmask64 cmpl_mask =
_mm512_cmp_epi8_mask(lower_v, vv, pred_lower);
@ -845,9 +997,32 @@ OpWithinRangeValImpl<int16_t, Op>::op_within_range_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int16_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 32) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i vv =
_mm512_loadu_si512(values + i + p + ip * BLOCK_COUNT);
const __mmask32 cmpl_mask =
_mm512_cmp_epi16_mask(lower_v, vv, pred_lower);
const __mmask32 cmp_mask = _mm512_mask_cmp_epi16_mask(
cmpl_mask, vv, upper_v, pred_upper);
res_u32[(i + p + ip * BLOCK_COUNT) / 32] = cmp_mask;
_mm_prefetch((const char*)(values + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size32 = (size / 32) * 32;
for (size_t i = 0; i < size32; i += 32) {
for (size_t i = size_8p; i < size32; i += 32) {
const __m512i vv = _mm512_loadu_si512(values + i);
const __mmask32 cmpl_mask =
_mm512_cmp_epi16_mask(lower_v, vv, pred_lower);
@ -898,9 +1073,32 @@ OpWithinRangeValImpl<int32_t, Op>::op_within_range_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int32_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i vv =
_mm512_loadu_si512(values + i + p + ip * BLOCK_COUNT);
const __mmask16 cmpl_mask =
_mm512_cmp_epi32_mask(lower_v, vv, pred_lower);
const __mmask16 cmp_mask = _mm512_mask_cmp_epi32_mask(
cmpl_mask, vv, upper_v, pred_upper);
res_u16[(i + p + ip * BLOCK_COUNT) / 16] = cmp_mask;
_mm_prefetch((const char*)(values + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m512i vv = _mm512_loadu_si512(values + i);
const __mmask16 cmpl_mask =
_mm512_cmp_epi32_mask(lower_v, vv, pred_lower);
@ -947,9 +1145,32 @@ OpWithinRangeValImpl<int64_t, Op>::op_within_range_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(int64_t);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 8) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i vv =
_mm512_loadu_si512(values + i + p + ip * BLOCK_COUNT);
const __mmask8 cmpl_mask =
_mm512_cmp_epi64_mask(lower_v, vv, pred_lower);
const __mmask8 cmp_mask = _mm512_mask_cmp_epi64_mask(
cmpl_mask, vv, upper_v, pred_upper);
res_u8[(i + p + ip * BLOCK_COUNT) / 8] = cmp_mask;
_mm_prefetch((const char*)(values + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size8 = (size / 8) * 8;
for (size_t i = 0; i < size8; i += 8) {
for (size_t i = size_8p; i < size8; i += 8) {
const __m512i vv = _mm512_loadu_si512(values + i);
const __mmask8 cmpl_mask =
_mm512_cmp_epi64_mask(lower_v, vv, pred_lower);
@ -984,9 +1205,32 @@ OpWithinRangeValImpl<float, Op>::op_within_range_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(float);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512 vv =
_mm512_loadu_ps(values + i + p + ip * BLOCK_COUNT);
const __mmask16 cmpl_mask =
_mm512_cmp_ps_mask(lower_v, vv, pred_lower);
const __mmask16 cmp_mask =
_mm512_mask_cmp_ps_mask(cmpl_mask, vv, upper_v, pred_upper);
res_u16[(i + p + ip * BLOCK_COUNT) / 16] = cmp_mask;
_mm_prefetch((const char*)(values + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m512 vv = _mm512_loadu_ps(values + i);
const __mmask16 cmpl_mask = _mm512_cmp_ps_mask(lower_v, vv, pred_lower);
const __mmask16 cmp_mask =
@ -1031,9 +1275,32 @@ OpWithinRangeValImpl<double, Op>::op_within_range_val(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / sizeof(double);
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 8) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512d vv =
_mm512_loadu_pd(values + i + p + ip * BLOCK_COUNT);
const __mmask8 cmpl_mask =
_mm512_cmp_pd_mask(lower_v, vv, pred_lower);
const __mmask8 cmp_mask =
_mm512_mask_cmp_pd_mask(cmpl_mask, vv, upper_v, pred_upper);
res_u8[(i + p + ip * BLOCK_COUNT) / 8] = cmp_mask;
_mm_prefetch((const char*)(values + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size8 = (size / 8) * 8;
for (size_t i = 0; i < size8; i += 8) {
for (size_t i = size_8p; i < size8; i += 8) {
const __m512d vv = _mm512_loadu_pd(values + i);
const __mmask8 cmpl_mask = _mm512_cmp_pd_mask(lower_v, vv, pred_lower);
const __mmask8 cmp_mask =
@ -1196,9 +1463,40 @@ OpArithCompareImpl<int8_t, AOp, CmpOp>::op_arith_compare(
const __m512i right_v = _mm512_set1_epi64(right_operand);
const __m512i value_v = _mm512_set1_epi64(value);
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / (sizeof(int8_t));
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m128i vs = _mm_loadu_si128(
(const __m128i*)(src + i + p + ip * BLOCK_COUNT));
const __m512i v0s = _mm512_cvtepi8_epi64(
_mm_unpacklo_epi64(vs, _mm_setzero_si128()));
const __m512i v1s = _mm512_cvtepi8_epi64(
_mm_unpackhi_epi64(vs, _mm_setzero_si128()));
const __mmask8 cmp_mask0 =
ArithHelperI64<AOp, CmpOp>::op(v0s, right_v, value_v);
const __mmask8 cmp_mask1 =
ArithHelperI64<AOp, CmpOp>::op(v1s, right_v, value_v);
res_u8[(i + p + ip * BLOCK_COUNT) / 8 + 0] = cmp_mask0;
res_u8[(i + p + ip * BLOCK_COUNT) / 8 + 1] = cmp_mask1;
if (p % CACHELINE_WIDTH == 0) {
_mm_prefetch(
(const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m128i vs = _mm_loadu_si128((const __m128i*)(src + i));
const __m512i v0s = _mm512_cvtepi8_epi64(
_mm_unpacklo_epi64(vs, _mm_setzero_si128()));
@ -1251,9 +1549,40 @@ OpArithCompareImpl<int16_t, AOp, CmpOp>::op_arith_compare(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / (sizeof(int16_t));
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m256i vs = _mm256_loadu_si256(
(const __m256i*)(src + i + p + ip * BLOCK_COUNT));
const __m512i v0s =
_mm512_cvtepi16_epi64(_mm256_extracti128_si256(vs, 0));
const __m512i v1s =
_mm512_cvtepi16_epi64(_mm256_extracti128_si256(vs, 1));
const __mmask8 cmp_mask0 =
ArithHelperI64<AOp, CmpOp>::op(v0s, right_v, value_v);
const __mmask8 cmp_mask1 =
ArithHelperI64<AOp, CmpOp>::op(v1s, right_v, value_v);
res_u8[(i + p + ip * BLOCK_COUNT) / 8 + 0] = cmp_mask0;
res_u8[(i + p + ip * BLOCK_COUNT) / 8 + 1] = cmp_mask1;
if ((2 * p) % CACHELINE_WIDTH == 0) {
_mm_prefetch(
(const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m256i vs = _mm256_loadu_si256((const __m256i*)(src + i));
const __m512i v0s =
_mm512_cvtepi16_epi64(_mm256_extracti128_si256(vs, 0));
@ -1304,9 +1633,37 @@ OpArithCompareImpl<int32_t, AOp, CmpOp>::op_arith_compare(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / (sizeof(int32_t));
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i vs = _mm512_loadu_si512(
(const __m512i*)(src + i + p + ip * BLOCK_COUNT));
const __m512i v0s =
_mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(vs, 0));
const __m512i v1s =
_mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(vs, 1));
const __mmask8 cmp_mask0 =
ArithHelperI64<AOp, CmpOp>::op(v0s, right_v, value_v);
const __mmask8 cmp_mask1 =
ArithHelperI64<AOp, CmpOp>::op(v1s, right_v, value_v);
res_u8[(i + p + ip * BLOCK_COUNT) / 8 + 0] = cmp_mask0;
res_u8[(i + p + ip * BLOCK_COUNT) / 8 + 1] = cmp_mask1;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m512i vs = _mm512_loadu_si512((const __m512i*)(src + i));
const __m512i v0s =
_mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(vs, 0));
@ -1358,9 +1715,30 @@ OpArithCompareImpl<int64_t, AOp, CmpOp>::op_arith_compare(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / (sizeof(int64_t));
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 8) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512i v0s = _mm512_loadu_si512(
(const __m512i*)(src + i + p + ip * BLOCK_COUNT));
const __mmask8 cmp_mask =
ArithHelperI64<AOp, CmpOp>::op(v0s, right_v, value_v);
res_u8[(i + p + ip * BLOCK_COUNT) / 8] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size8 = (size / 8) * 8;
for (size_t i = 0; i < size8; i += 8) {
for (size_t i = size_8p; i < size8; i += 8) {
const __m512i v0s = _mm512_loadu_si512((const __m512i*)(src + i));
const __mmask8 cmp_mask =
ArithHelperI64<AOp, CmpOp>::op(v0s, right_v, value_v);
@ -1394,9 +1772,30 @@ OpArithCompareImpl<float, AOp, CmpOp>::op_arith_compare(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / (sizeof(float));
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 16) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512 v0s =
_mm512_loadu_ps(src + i + p + ip * BLOCK_COUNT);
const __mmask16 cmp_mask =
ArithHelperF32<AOp, CmpOp>::op(v0s, right_v, value_v);
res_u16[(i + p + ip * BLOCK_COUNT) / 16] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size16 = (size / 16) * 16;
for (size_t i = 0; i < size16; i += 16) {
for (size_t i = size_8p; i < size16; i += 16) {
const __m512 v0s = _mm512_loadu_ps(src + i);
const __mmask16 cmp_mask =
ArithHelperF32<AOp, CmpOp>::op(v0s, right_v, value_v);
@ -1437,9 +1836,30 @@ OpArithCompareImpl<double, AOp, CmpOp>::op_arith_compare(
// todo: aligned reads & writes
// interleaved pages
constexpr size_t BLOCK_COUNT = PAGE_SIZE / (sizeof(int64_t));
const size_t size_8p =
(size / (N_BLOCKS * BLOCK_COUNT)) * N_BLOCKS * BLOCK_COUNT;
for (size_t i = 0; i < size_8p; i += N_BLOCKS * BLOCK_COUNT) {
for (size_t p = 0; p < BLOCK_COUNT; p += 8) {
for (size_t ip = 0; ip < N_BLOCKS; ip++) {
const __m512d v0s =
_mm512_loadu_pd(src + i + p + ip * BLOCK_COUNT);
const __mmask8 cmp_mask =
ArithHelperF64<AOp, CmpOp>::op(v0s, right_v, value_v);
res_u8[(i + p + ip * BLOCK_COUNT) / 8] = cmp_mask;
_mm_prefetch((const char*)(src + i + p + ip * BLOCK_COUNT) +
BLOCKS_PREFETCH_AHEAD * CACHELINE_WIDTH,
_MM_HINT_T0);
}
}
}
// process big blocks
const size_t size8 = (size / 8) * 8;
for (size_t i = 0; i < size8; i += 8) {
for (size_t i = size_8p; i < size8; i += 8) {
const __m512d v0s = _mm512_loadu_pd(src + i);
const __mmask8 cmp_mask =
ArithHelperF64<AOp, CmpOp>::op(v0s, right_v, value_v);