[x265] [PATCH 3/3] AArch64: Delete redundant SADx4 Neon intrinsics primitives

Jonathan Wright jonathan.wright at arm.com
Tue Jan 7 16:27:14 UTC 2025


Delete the SADx4 Neon intrinsics primitives since we now have
optimized Neon assembly implementations for all block sizes and
bit depths.
---
 source/common/aarch64/pixel-prim.cpp | 153 ---------------------------
 1 file changed, 153 deletions(-)
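
Note (placed below the "---" line, so not part of the commit message):
the sad_x4 primitive being removed computes four sums of absolute
differences at once, comparing one source block against four reference
blocks and writing the four SADs to res[0..3]. For readers who want a
reminder of the contract, here is a minimal scalar sketch mirroring the
per-pixel tail loop of the deleted function. The name sad_x4_ref is
purely illustrative and is not added anywhere in the tree; the pixel
type, FENC_STRIDE constant, and abs() call are the same ones the
removed code already used in pixel-prim.cpp.

    template<int lx, int ly>
    void sad_x4_ref(const pixel *pix1, const pixel *pix2, const pixel *pix3,
                    const pixel *pix4, const pixel *pix5,
                    intptr_t frefstride, int32_t *res)
    {
        // One SAD accumulator per reference block.
        res[0] = res[1] = res[2] = res[3] = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                res[0] += abs(pix1[x] - pix2[x]);
                res[1] += abs(pix1[x] - pix3[x]);
                res[2] += abs(pix1[x] - pix4[x]);
                res[3] += abs(pix1[x] - pix5[x]);
            }
            pix1 += FENC_STRIDE;  // source block uses the fixed encoder stride
            pix2 += frefstride;   // the four references share one stride
            pix3 += frefstride;
            pix4 += frefstride;
            pix5 += frefstride;
        }
    }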

diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 947e2b132..52ce522ad 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -909,151 +909,6 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
     return sum;
 }
 
-template<int lx, int ly>
-void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
-                 intptr_t frefstride, int32_t *res)
-{
-    uint32x4_t result = vdupq_n_u32(0);
-
-    for (int y = 0; y < ly; y++)
-    {
-        int x = 0;
-        uint16x8_t vsum16_0 = vdupq_n_u16(0);
-        uint16x8_t vsum16_1 = vdupq_n_u16(0);
-        uint16x8_t vsum16_2 = vdupq_n_u16(0);
-        uint16x8_t vsum16_3 = vdupq_n_u16(0);
-#if HIGH_BIT_DEPTH
-        for (; (x + 16) <= lx; x += 16)
-        {
-            uint16x8x2_t p1 = vld1q_u16_x2(&pix1[x]);
-            uint16x8x2_t p2 = vld1q_u16_x2(&pix2[x]);
-            uint16x8x2_t p3 = vld1q_u16_x2(&pix3[x]);
-            uint16x8x2_t p4 = vld1q_u16_x2(&pix4[x]);
-            uint16x8x2_t p5 = vld1q_u16_x2(&pix5[x]);
-            vsum16_0 = vabaq_u16(vsum16_0, p1.val[0], p2.val[0]);
-            vsum16_1 = vabaq_u16(vsum16_1, p1.val[0], p3.val[0]);
-            vsum16_2 = vabaq_u16(vsum16_2, p1.val[0], p4.val[0]);
-            vsum16_3 = vabaq_u16(vsum16_3, p1.val[0], p5.val[0]);
-            vsum16_0 = vabaq_u16(vsum16_0, p1.val[1], p2.val[1]);
-            vsum16_1 = vabaq_u16(vsum16_1, p1.val[1], p3.val[1]);
-            vsum16_2 = vabaq_u16(vsum16_2, p1.val[1], p4.val[1]);
-            vsum16_3 = vabaq_u16(vsum16_3, p1.val[1], p5.val[1]);
-        }
-        if (lx & 8)
-        {
-            uint16x8_t p1 = vld1q_u16(pix1 + x);
-            uint16x8_t p2 = vld1q_u16(pix2 + x);
-            uint16x8_t p3 = vld1q_u16(pix3 + x);
-            uint16x8_t p4 = vld1q_u16(pix4 + x);
-            uint16x8_t p5 = vld1q_u16(pix5 + x);
-            vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
-            vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
-            vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
-            vsum16_3 = vabaq_u16(vsum16_3, p1, p5);
-            x += 8;
-        }
-        if (lx & 4)
-        {
-            /* This is equivalent to getting the absolute difference of pix1[x] with each of
-             * pix2 - pix5, then summing across the vector (4 values each) and adding the
-             * result to result. */
-            uint16x8_t p1 = vreinterpretq_u16_u64(
-                vld1q_dup_u64((uint64_t *)&pix1[x]));
-            uint16x8_t p2_3 = vcombine_u16(vld1_u16(pix2 + x),
-                                           vld1_u16(pix3 + x));
-            uint16x8_t p4_5 = vcombine_u16(vld1_u16(pix4 + x),
-                                           vld1_u16(pix5 + x));
-
-            uint16x8_t a = vabdq_u16(p1, p2_3);
-            uint16x8_t b = vabdq_u16(p1, p4_5);
-
-            result = vpadalq_u16(result, vpaddq_u16(a, b));
-            x += 4;
-        }
-        if (lx >= 4)
-        {
-            /* This is equivalent to adding across each of the sum vectors and then adding
-             * to result. */
-            uint32x4_t sum01 = vpaddlq_u16(vpaddq_u16(vsum16_0, vsum16_1));
-            uint32x4_t sum23 = vpaddlq_u16(vpaddq_u16(vsum16_2, vsum16_3));
-            result = vaddq_u32(result, vpaddq_u32(sum01, sum23));
-        }
-
-#else
-
-        for (; (x + 16) <= lx; x += 16)
-        {
-            uint8x16_t p1 = vld1q_u8(pix1 + x);
-            uint8x16_t p2 = vld1q_u8(pix2 + x);
-            uint8x16_t p3 = vld1q_u8(pix3 + x);
-            uint8x16_t p4 = vld1q_u8(pix4 + x);
-            uint8x16_t p5 = vld1q_u8(pix5 + x);
-            vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
-            vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
-            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
-            vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
-            vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
-            vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
-            vsum16_3 = vabal_u8(vsum16_3, vget_low_u8(p1), vget_low_u8(p5));
-            vsum16_3 = vabal_high_u8(vsum16_3, p1, p5);
-        }
-        if (lx & 8)
-        {
-            uint8x8_t p1 = vld1_u8(pix1 + x);
-            uint8x8_t p2 = vld1_u8(pix2 + x);
-            uint8x8_t p3 = vld1_u8(pix3 + x);
-            uint8x8_t p4 = vld1_u8(pix4 + x);
-            uint8x8_t p5 = vld1_u8(pix5 + x);
-            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
-            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
-            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
-            vsum16_3 = vabal_u8(vsum16_3, p1, p5);
-            x += 8;
-        }
-        if (lx & 4)
-        {
-            uint8x16_t p1 = vreinterpretq_u8_u32(
-                vld1q_dup_u32((uint32_t *)&pix1[x]));
-
-            uint32x4_t p_x4 = vdupq_n_u32(0);
-            p_x4 = vld1q_lane_u32((uint32_t *)&pix2[x], p_x4, 0);
-            p_x4 = vld1q_lane_u32((uint32_t *)&pix3[x], p_x4, 1);
-            p_x4 = vld1q_lane_u32((uint32_t *)&pix4[x], p_x4, 2);
-            p_x4 = vld1q_lane_u32((uint32_t *)&pix5[x], p_x4, 3);
-
-            uint16x8_t sum = vabdl_u8(vget_low_u8(p1),
-                                      vget_low_u8(vreinterpretq_u8_u32(p_x4)));
-            uint16x8_t sum2 = vabdl_high_u8(p1, vreinterpretq_u8_u32(p_x4));
-
-            result = vpadalq_u16(result, vpaddq_u16(sum, sum2));
-        }
-        if (lx >= 4)
-        {
-            result[0] += vaddvq_u16(vsum16_0);
-            result[1] += vaddvq_u16(vsum16_1);
-            result[2] += vaddvq_u16(vsum16_2);
-            result[3] += vaddvq_u16(vsum16_3);
-        }
-
-#endif
-        if (lx & 3) for (; x < lx; x++)
-        {
-            result[0] += abs(pix1[x] - pix2[x]);
-            result[1] += abs(pix1[x] - pix3[x]);
-            result[2] += abs(pix1[x] - pix4[x]);
-            result[3] += abs(pix1[x] - pix5[x]);
-        }
-
-        pix1 += FENC_STRIDE;
-        pix2 += frefstride;
-        pix3 += frefstride;
-        pix4 += frefstride;
-        pix5 += frefstride;
-    }
-
-    vst1q_s32(res, vreinterpretq_s32_u32(result));
-}
-
 
 template<int bx, int by>
 void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
@@ -1541,7 +1396,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
-    p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
 
@@ -1555,7 +1409,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
-    p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
 #endif // !(HIGH_BIT_DEPTH)
@@ -1600,12 +1453,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     LUMA_PU(64, 16);
     LUMA_PU(16, 64);
 
-#if !(HIGH_BIT_DEPTH)
-    p.pu[LUMA_4x4].sad_x4 = sad_x4_neon<4, 4>;
-    p.pu[LUMA_4x8].sad_x4 = sad_x4_neon<4, 8>;
-    p.pu[LUMA_4x16].sad_x4 = sad_x4_neon<4, 16>;
-#endif // !(HIGH_BIT_DEPTH)
-
     p.pu[LUMA_4x4].satd   = pixel_satd_4x4_neon;
     p.pu[LUMA_8x4].satd   = pixel_satd_8x4_neon;
     
-- 
2.39.3 (Apple Git-146)
