[x265] [PATCH 07/11] AArch64: Delete sse_neon implementation
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Dec 10 16:02:59 UTC 2024
The Neon intrinsics implementation of sse is no longer used, as a
faster assembly implementation exists for both standard and high
bit-depth. Delete the sse_neon function.
Change-Id: I0ff88a5153764f61517f50ffe3b93f2ba2856238
---
source/common/aarch64/pixel-prim.cpp | 61 ----------------------------
1 file changed, 61 deletions(-)
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index dd3fd4637..63b30604c 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1155,65 +1155,6 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
}
-template<int lx, int ly, class T1, class T2>
-sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
-{
- sse_t sum = 0;
-
- int32x4_t vsum1 = vdupq_n_s32(0);
- int32x4_t vsum2 = vdupq_n_s32(0);
- for (int y = 0; y < ly; y++)
- {
- int x = 0;
- for (; (x + 8) <= lx; x += 8)
- {
- int16x8_t tmp;
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
- uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
- }
- else if (sizeof(T1) == 1 && sizeof(T2) == 1)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
- uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
- }
- else
- {
- X265_CHECK(false, "unsupported sse");
- }
- vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
- vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
- }
- for (; x < lx; x++)
- {
- int tmp = pix1[x] - pix2[x];
- sum += (tmp * tmp);
- }
-
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
- sum += vaddvq_s32(vsum);
- vsum1 = vsum2 = vdupq_n_s32(0);
- }
-
- pix1 += stride_pix1;
- pix2 += stride_pix2;
- }
-
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
-
- return sum + vaddvq_s32(vsum);
-}
-
-
template<int bx, int by>
void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
{
@@ -1953,7 +1894,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_420(W, H) \
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
@@ -2044,7 +1984,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_422(W, H) \
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
--
2.39.5 (Apple Git-154)
-------------- next part --------------
>From 4db045e36c00550dae8eced28bf7daf5c01e99d1 Mon Sep 17 00:00:00 2001
Message-Id: <4db045e36c00550dae8eced28bf7daf5c01e99d1.1733846134.git.gerdazsejke.more at arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.more at arm.com>
References: <cover.1733846134.git.gerdazsejke.more at arm.com>
From: Gerda Zsejke More <gerdazsejke.more at arm.com>
Date: Sat, 7 Dec 2024 16:20:38 +0100
Subject: [PATCH 07/11] AArch64: Delete sse_neon implementation
The Neon intrinsics implementation of sse is no longer used, as a
faster assembly implementation exists for both standard and high
bit-depth. Delete the sse_neon function.
Change-Id: I0ff88a5153764f61517f50ffe3b93f2ba2856238
---
source/common/aarch64/pixel-prim.cpp | 61 ----------------------------
1 file changed, 61 deletions(-)
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index dd3fd4637..63b30604c 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1155,65 +1155,6 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
}
-template<int lx, int ly, class T1, class T2>
-sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
-{
- sse_t sum = 0;
-
- int32x4_t vsum1 = vdupq_n_s32(0);
- int32x4_t vsum2 = vdupq_n_s32(0);
- for (int y = 0; y < ly; y++)
- {
- int x = 0;
- for (; (x + 8) <= lx; x += 8)
- {
- int16x8_t tmp;
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
- uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
- }
- else if (sizeof(T1) == 1 && sizeof(T2) == 1)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
- uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
- }
- else
- {
- X265_CHECK(false, "unsupported sse");
- }
- vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
- vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
- }
- for (; x < lx; x++)
- {
- int tmp = pix1[x] - pix2[x];
- sum += (tmp * tmp);
- }
-
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
- sum += vaddvq_s32(vsum);
- vsum1 = vsum2 = vdupq_n_s32(0);
- }
-
- pix1 += stride_pix1;
- pix2 += stride_pix2;
- }
-
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
-
- return sum + vaddvq_s32(vsum);
-}
-
-
template<int bx, int by>
void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
{
@@ -1953,7 +1894,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_420(W, H) \
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
@@ -2044,7 +1984,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_422(W, H) \
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
--
2.39.5 (Apple Git-154)
More information about the x265-devel
mailing list