[x265] [PATCH 3/3] AArch64: Delete redundant SADx4 Neon intrinsics primitives
Jonathan Wright
jonathan.wright at arm.com
Tue Jan 7 16:27:14 UTC 2025
Delete the SADx4 Neon intrinsics primitives since we now have
optimized Neon assembly implementations for all block sizes and
bitdepths.
---
source/common/aarch64/pixel-prim.cpp | 153 ---------------------------
1 file changed, 153 deletions(-)
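For context, a minimal scalar sketch of the contract the deleted primitive fulfils (and which the Neon assembly replacements also implement): one encode block at pix1, walked with FENC_STRIDE, is compared against four reference candidates pix2..pix5 that share a single stride, and the four SAD totals are written to res[0..3]. This reference version is not part of the patch; the pixel typedef and FENC_STRIDE constant are re-declared here only to keep it self-contained (in x265 they come from the common headers, and pixel is uint16_t in high-bit-depth builds).

#include <cstdint>
#include <cstdlib>

typedef uint8_t pixel;                 // assumption: 8-bit build; uint16_t when HIGH_BIT_DEPTH
static const intptr_t FENC_STRIDE = 64;  // encode-block stride used by x265

template<int lx, int ly>
void sad_x4_ref(const pixel *pix1, const pixel *pix2, const pixel *pix3,
                const pixel *pix4, const pixel *pix5,
                intptr_t frefstride, int32_t *res)
{
    res[0] = res[1] = res[2] = res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            // Accumulate the absolute difference of the encode pixel
            // against each of the four reference candidates.
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }
        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}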
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 947e2b132..52ce522ad 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -909,151 +909,6 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
return sum;
}
-template<int lx, int ly>
-void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
- intptr_t frefstride, int32_t *res)
-{
- uint32x4_t result = vdupq_n_u32(0);
-
- for (int y = 0; y < ly; y++)
- {
- int x = 0;
- uint16x8_t vsum16_0 = vdupq_n_u16(0);
- uint16x8_t vsum16_1 = vdupq_n_u16(0);
- uint16x8_t vsum16_2 = vdupq_n_u16(0);
- uint16x8_t vsum16_3 = vdupq_n_u16(0);
-#if HIGH_BIT_DEPTH
- for (; (x + 16) <= lx; x += 16)
- {
- uint16x8x2_t p1 = vld1q_u16_x2(&pix1[x]);
- uint16x8x2_t p2 = vld1q_u16_x2(&pix2[x]);
- uint16x8x2_t p3 = vld1q_u16_x2(&pix3[x]);
- uint16x8x2_t p4 = vld1q_u16_x2(&pix4[x]);
- uint16x8x2_t p5 = vld1q_u16_x2(&pix5[x]);
- vsum16_0 = vabaq_u16(vsum16_0, p1.val[0], p2.val[0]);
- vsum16_1 = vabaq_u16(vsum16_1, p1.val[0], p3.val[0]);
- vsum16_2 = vabaq_u16(vsum16_2, p1.val[0], p4.val[0]);
- vsum16_3 = vabaq_u16(vsum16_3, p1.val[0], p5.val[0]);
- vsum16_0 = vabaq_u16(vsum16_0, p1.val[1], p2.val[1]);
- vsum16_1 = vabaq_u16(vsum16_1, p1.val[1], p3.val[1]);
- vsum16_2 = vabaq_u16(vsum16_2, p1.val[1], p4.val[1]);
- vsum16_3 = vabaq_u16(vsum16_3, p1.val[1], p5.val[1]);
- }
- if (lx & 8)
- {
- uint16x8_t p1 = vld1q_u16(pix1 + x);
- uint16x8_t p2 = vld1q_u16(pix2 + x);
- uint16x8_t p3 = vld1q_u16(pix3 + x);
- uint16x8_t p4 = vld1q_u16(pix4 + x);
- uint16x8_t p5 = vld1q_u16(pix5 + x);
- vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
- vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
- vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
- vsum16_3 = vabaq_u16(vsum16_3, p1, p5);
- x += 8;
- }
- if (lx & 4)
- {
- /* This is equivalent to getting the absolute difference of pix1[x] with each of
- * pix2 - pix5, then summing across the vector (4 values each) and adding the
- * result to result. */
- uint16x8_t p1 = vreinterpretq_u16_u64(
- vld1q_dup_u64((uint64_t *)&pix1[x]));
- uint16x8_t p2_3 = vcombine_u16(vld1_u16(pix2 + x),
- vld1_u16(pix3 + x));
- uint16x8_t p4_5 = vcombine_u16(vld1_u16(pix4 + x),
- vld1_u16(pix5 + x));
-
- uint16x8_t a = vabdq_u16(p1, p2_3);
- uint16x8_t b = vabdq_u16(p1, p4_5);
-
- result = vpadalq_u16(result, vpaddq_u16(a, b));
- x += 4;
- }
- if (lx >= 4)
- {
- /* This is equivalent to adding across each of the sum vectors and then adding
- * to result. */
- uint32x4_t sum01 = vpaddlq_u16(vpaddq_u16(vsum16_0, vsum16_1));
- uint32x4_t sum23 = vpaddlq_u16(vpaddq_u16(vsum16_2, vsum16_3));
- result = vaddq_u32(result, vpaddq_u32(sum01, sum23));
- }
-
-#else
-
- for (; (x + 16) <= lx; x += 16)
- {
- uint8x16_t p1 = vld1q_u8(pix1 + x);
- uint8x16_t p2 = vld1q_u8(pix2 + x);
- uint8x16_t p3 = vld1q_u8(pix3 + x);
- uint8x16_t p4 = vld1q_u8(pix4 + x);
- uint8x16_t p5 = vld1q_u8(pix5 + x);
- vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
- vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
- vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
- vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
- vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
- vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
- vsum16_3 = vabal_u8(vsum16_3, vget_low_u8(p1), vget_low_u8(p5));
- vsum16_3 = vabal_high_u8(vsum16_3, p1, p5);
- }
- if (lx & 8)
- {
- uint8x8_t p1 = vld1_u8(pix1 + x);
- uint8x8_t p2 = vld1_u8(pix2 + x);
- uint8x8_t p3 = vld1_u8(pix3 + x);
- uint8x8_t p4 = vld1_u8(pix4 + x);
- uint8x8_t p5 = vld1_u8(pix5 + x);
- vsum16_0 = vabal_u8(vsum16_0, p1, p2);
- vsum16_1 = vabal_u8(vsum16_1, p1, p3);
- vsum16_2 = vabal_u8(vsum16_2, p1, p4);
- vsum16_3 = vabal_u8(vsum16_3, p1, p5);
- x += 8;
- }
- if (lx & 4)
- {
- uint8x16_t p1 = vreinterpretq_u8_u32(
- vld1q_dup_u32((uint32_t *)&pix1[x]));
-
- uint32x4_t p_x4 = vdupq_n_u32(0);
- p_x4 = vld1q_lane_u32((uint32_t *)&pix2[x], p_x4, 0);
- p_x4 = vld1q_lane_u32((uint32_t *)&pix3[x], p_x4, 1);
- p_x4 = vld1q_lane_u32((uint32_t *)&pix4[x], p_x4, 2);
- p_x4 = vld1q_lane_u32((uint32_t *)&pix5[x], p_x4, 3);
-
- uint16x8_t sum = vabdl_u8(vget_low_u8(p1),
- vget_low_u8(vreinterpretq_u8_u32(p_x4)));
- uint16x8_t sum2 = vabdl_high_u8(p1, vreinterpretq_u8_u32(p_x4));
-
- result = vpadalq_u16(result, vpaddq_u16(sum, sum2));
- }
- if (lx >= 4)
- {
- result[0] += vaddvq_u16(vsum16_0);
- result[1] += vaddvq_u16(vsum16_1);
- result[2] += vaddvq_u16(vsum16_2);
- result[3] += vaddvq_u16(vsum16_3);
- }
-
-#endif
- if (lx & 3) for (; x < lx; x++)
- {
- result[0] += abs(pix1[x] - pix2[x]);
- result[1] += abs(pix1[x] - pix3[x]);
- result[2] += abs(pix1[x] - pix4[x]);
- result[3] += abs(pix1[x] - pix5[x]);
- }
-
- pix1 += FENC_STRIDE;
- pix2 += frefstride;
- pix3 += frefstride;
- pix4 += frefstride;
- pix5 += frefstride;
- }
-
- vst1q_s32(res, vreinterpretq_s32_u32(result));
-}
-
template<int bx, int by>
void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
@@ -1541,7 +1396,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
- p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
@@ -1555,7 +1409,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
- p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
#endif // !(HIGH_BIT_DEPTH)
@@ -1600,12 +1453,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
LUMA_PU(64, 16);
LUMA_PU(16, 64);
-#if !(HIGH_BIT_DEPTH)
- p.pu[LUMA_4x4].sad_x4 = sad_x4_neon<4, 4>;
- p.pu[LUMA_4x8].sad_x4 = sad_x4_neon<4, 8>;
- p.pu[LUMA_4x16].sad_x4 = sad_x4_neon<4, 16>;
-#endif // !(HIGH_BIT_DEPTH)
-
p.pu[LUMA_4x4].satd = pixel_satd_4x4_neon;
p.pu[LUMA_8x4].satd = pixel_satd_8x4_neon;
--
2.39.3 (Apple Git-146)