[x265] [PATCH 07/11] AArch64: Delete sse_neon implementation
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Dec 10 16:02:59 UTC 2024
The Neon intrinsics implementation of sse is no longer used, as a
faster assembly implementation exists for both standard and high
bit-depth. Delete the sse_neon function.
Change-Id: I0ff88a5153764f61517f50ffe3b93f2ba2856238
---
source/common/aarch64/pixel-prim.cpp | 61 ----------------------------
1 file changed, 61 deletions(-)
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index dd3fd4637..63b30604c 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1155,65 +1155,6 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
}
-template<int lx, int ly, class T1, class T2>
-sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
-{
- sse_t sum = 0;
-
- int32x4_t vsum1 = vdupq_n_s32(0);
- int32x4_t vsum2 = vdupq_n_s32(0);
- for (int y = 0; y < ly; y++)
- {
- int x = 0;
- for (; (x + 8) <= lx; x += 8)
- {
- int16x8_t tmp;
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
- uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
- }
- else if (sizeof(T1) == 1 && sizeof(T2) == 1)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
- uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
- }
- else
- {
- X265_CHECK(false, "unsupported sse");
- }
- vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
- vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
- }
- for (; x < lx; x++)
- {
- int tmp = pix1[x] - pix2[x];
- sum += (tmp * tmp);
- }
-
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
- sum += vaddvq_s32(vsum);
- vsum1 = vsum2 = vdupq_n_s32(0);
- }
-
- pix1 += stride_pix1;
- pix2 += stride_pix2;
- }
-
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
-
- return sum + vaddvq_s32(vsum);
-}
-
-
template<int bx, int by>
void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
{
@@ -1953,7 +1894,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_420(W, H) \
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
@@ -2044,7 +1984,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_422(W, H) \
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
--
2.39.5 (Apple Git-154)
-------------- next part --------------
>From 4db045e36c00550dae8eced28bf7daf5c01e99d1 Mon Sep 17 00:00:00 2001
Message-Id: <4db045e36c00550dae8eced28bf7daf5c01e99d1.1733846134.git.gerdazsejke.more at arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.more at arm.com>
References: <cover.1733846134.git.gerdazsejke.more at arm.com>
From: Gerda Zsejke More <gerdazsejke.more at arm.com>
Date: Sat, 7 Dec 2024 16:20:38 +0100
Subject: [PATCH 07/11] AArch64: Delete sse_neon implementation
The Neon intrinsics implementation of sse is no longer used, as a
faster assembly implementation exists for both standard and high
bit-depth. Delete the sse_neon function.
Change-Id: I0ff88a5153764f61517f50ffe3b93f2ba2856238
---
source/common/aarch64/pixel-prim.cpp | 61 ----------------------------
1 file changed, 61 deletions(-)
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index dd3fd4637..63b30604c 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1155,65 +1155,6 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
}
-template<int lx, int ly, class T1, class T2>
-sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
-{
- sse_t sum = 0;
-
- int32x4_t vsum1 = vdupq_n_s32(0);
- int32x4_t vsum2 = vdupq_n_s32(0);
- for (int y = 0; y < ly; y++)
- {
- int x = 0;
- for (; (x + 8) <= lx; x += 8)
- {
- int16x8_t tmp;
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
- uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
- }
- else if (sizeof(T1) == 1 && sizeof(T2) == 1)
- {
- // We have to cast to the 'real' type so that this block
- // will compile for both low and high bitdepth.
- uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
- uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
- tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
- }
- else
- {
- X265_CHECK(false, "unsupported sse");
- }
- vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
- vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
- }
- for (; x < lx; x++)
- {
- int tmp = pix1[x] - pix2[x];
- sum += (tmp * tmp);
- }
-
- if (sizeof(T1) == 2 && sizeof(T2) == 2)
- {
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
- sum += vaddvq_s32(vsum);
- vsum1 = vsum2 = vdupq_n_s32(0);
- }
-
- pix1 += stride_pix1;
- pix2 += stride_pix2;
- }
-
- int32x4_t vsum = vaddq_s32(vsum1, vsum2);
-
- return sum + vaddvq_s32(vsum);
-}
-
-
template<int bx, int by>
void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
{
@@ -1953,7 +1894,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_420(W, H) \
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
@@ -2044,7 +1984,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_422(W, H) \
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
--
2.39.5 (Apple Git-154)
More information about the x265-devel
mailing list