[x265] [PATCH 2/3] AArch64: Simplify SBD and HBD psyCost_pp_neon
Li Zhang
li.zhang2 at arm.com
Wed Apr 30 18:17:57 UTC 2025
Simplify the psyCost_pp_neon implementation by removing the unnecessary
computation of absolute differences against zeroBuf in the calls to the
SA8D/SATD and SAD functions. These calls are replaced with a combined
function in which both the Hadamard 4/8 transform and the pixel summation
are computed and then subtracted.

Also remove the assembly implementation for the 4x4 block size, which is
slower than the simplified intrinsics.
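As a minimal scalar sketch of the underlying observation (illustration only,
not part of the patch; the helper name is hypothetical): against a zeroed
reference block the SAD term of the psy-cost energy degenerates to a plain
pixel sum, so it can be folded into the same routine that already computes
the Hadamard transform instead of issuing a separate SAD call:

// Illustration only: the SAD-against-zeroBuf term is just the pixel sum.
// 8-bit path shown; the HBD path would use uint16_t pixels instead.
#include <cstdint>

static inline int block_dc_term_8x8(const uint8_t *blk, intptr_t stride)
{
    int sum = 0;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            sum += blk[y * stride + x]; // same value as sad_pp_neon<8, 8>(blk, stride, zeroBuf, 0)
    return sum >> 2;                    // DC term subtracted from the sa8d (AC + DC) energy
}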
---
source/common/aarch64/asm-primitives.cpp | 3 -
source/common/aarch64/pixel-prim.cpp | 139 +++++++++++++++++++++--
source/common/aarch64/pixel-util.S | 82 -------------
3 files changed, 127 insertions(+), 97 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 4d2c575d1..e16150d4f 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -671,9 +671,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].normFact = PFX(normFact32_neon);
p.cu[BLOCK_64x64].normFact = PFX(normFact64_neon);
- // psy_cost_pp
- p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
-
#if !defined(__APPLE__)
p.scanPosLast = PFX(scanPosLast_neon);
#endif
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 67c388b59..b08f98457 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1393,11 +1393,131 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i
}
}
+#if HIGH_BIT_DEPTH
+static inline int calc_energy_8x8(const uint16_t *source, intptr_t sstride)
+{
+ uint16x8_t s[8];
+ load_u16x8xn<8>(source, sstride, s);
+
+ int16x8_t in[8], temp[8];
+
+ in[0] = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1]));
+ in[1] = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3]));
+ in[2] = vreinterpretq_s16_u16(vaddq_u16(s[4], s[5]));
+ in[3] = vreinterpretq_s16_u16(vaddq_u16(s[6], s[7]));
+ in[4] = vreinterpretq_s16_u16(vsubq_u16(s[0], s[1]));
+ in[5] = vreinterpretq_s16_u16(vsubq_u16(s[2], s[3]));
+ in[6] = vreinterpretq_s16_u16(vsubq_u16(s[4], s[5]));
+ in[7] = vreinterpretq_s16_u16(vsubq_u16(s[6], s[7]));
+
+ hadamard_4_v(in, temp);
+ hadamard_4_v(in + 4, temp + 4);
+
+ // The first line after the vertical Hadamard transform contains the sum of coefficients.
+ int sum = vaddlvq_s16(temp[0]) >> 2;
+
+#if X265_DEPTH == 10
+ uint16x8_t sa8_out[4];
+
+ hadamard_8_h(temp, sa8_out);
+
+ uint32x4_t res = vpaddlq_u16(sa8_out[0]);
+ res = vpadalq_u16(res, sa8_out[1]);
+ res = vpadalq_u16(res, sa8_out[2]);
+ res = vpadalq_u16(res, sa8_out[3]);
+#else // X265_DEPTH == 12
+ uint32x4_t sa8_out[4];
+
+ hadamard_8_h(temp, sa8_out);
+
+ sa8_out[0] = vaddq_u32(sa8_out[0], sa8_out[1]);
+ sa8_out[2] = vaddq_u32(sa8_out[2], sa8_out[3]);
+ uint32x4_t res = vaddq_u32(sa8_out[0], sa8_out[2]);
+#endif // X265_DEPTH == 10
+
+ int sa8 = (vaddvq_u32(res) + 1) >> 1;
+
+ return sa8 - sum;
+}
+
+#else // !HIGH_BIT_DEPTH
+static inline int calc_energy_8x8(const uint8_t *source, intptr_t sstride)
+{
+ uint8x8_t s[8];
+ load_u8x8xn<8>(source, sstride, s);
+
+ int16x8_t in[8], temp[8];
+
+ in[0] = vreinterpretq_s16_u16(vaddl_u8(s[0], s[1]));
+ in[1] = vreinterpretq_s16_u16(vaddl_u8(s[2], s[3]));
+ in[2] = vreinterpretq_s16_u16(vaddl_u8(s[4], s[5]));
+ in[3] = vreinterpretq_s16_u16(vaddl_u8(s[6], s[7]));
+ in[4] = vreinterpretq_s16_u16(vsubl_u8(s[0], s[1]));
+ in[5] = vreinterpretq_s16_u16(vsubl_u8(s[2], s[3]));
+ in[6] = vreinterpretq_s16_u16(vsubl_u8(s[4], s[5]));
+ in[7] = vreinterpretq_s16_u16(vsubl_u8(s[6], s[7]));
+
+ hadamard_4_v(in, temp);
+ hadamard_4_v(in + 4, temp + 4);
+
+ // The first line after the vertical Hadamard transform contains the sum of coefficients.
+ int sum = vaddvq_s16(temp[0]) >> 2;
+
+ uint16x8_t sa8_out[4];
+ hadamard_8_h(temp, sa8_out);
+
+ uint16x8_t res = vaddq_u16(sa8_out[0], sa8_out[1]);
+ res = vaddq_u16(res, sa8_out[2]);
+ res = vaddq_u16(res, sa8_out[3]);
+
+ int sa8 = (vaddlvq_u16(res) + 1) >> 1;
+
+ return sa8 - sum;
+}
+
+#endif // HIGH_BIT_DEPTH
+
+static inline int calc_energy_4x4(const pixel *source, intptr_t sstride)
+{
+#if HIGH_BIT_DEPTH
+ uint16x4_t s[4];
+ load_u16x4xn<4>(source, sstride, s);
+
+ uint16x8_t s01 = vcombine_u16(s[0], s[1]);
+ uint16x8_t s23 = vcombine_u16(s[2], s[3]);
+
+ int16x8_t s01_23 = vreinterpretq_s16_u16(vaddq_u16(s01, s23));
+ int16x8_t d01_23 = vreinterpretq_s16_u16(vsubq_u16(s01, s23));
+#else
+ uint8x8_t s[2];
+ s[0] = load_u8x4x2(source + 0 * sstride, sstride);
+ s[1] = load_u8x4x2(source + 2 * sstride, sstride);
+
+ int16x8_t s01_23 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[1]));
+ int16x8_t d01_23 = vreinterpretq_s16_u16(vsubl_u8(s[0], s[1]));
+#endif
+
+ // The first line after the vertical Hadamard transform contains the sum of coefficients.
+ int sum = vaddvq_u16(vreinterpretq_u16_s16(s01_23)) >> 2;
+
+ int16x8_t t0, t1;
+
+ transpose_s16_s64x2(&t0, &t1, s01_23, d01_23);
+ sumsubq_s16(&s01_23, &d01_23, t0, t1);
+
+ transpose_s16_s16x2(&t0, &t1, s01_23, d01_23);
+ sumsubq_s16(&s01_23, &d01_23, t0, t1);
+
+ transpose_s16_s32x2(&t0, &t1, s01_23, d01_23);
+
+ int sat = vaddvq_u16(max_abs_s16(t0, t1));
+
+ return sat - sum;
+}
+
template<int size>
int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
{
- static pixel zeroBuf[8] /* = { 0 } */;
-
if (size)
{
int dim = 1 << (size + 2);
@@ -1406,11 +1526,8 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
{
for (int j = 0; j < dim; j += 8)
{
- /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
- int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
- (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
- int reconEnergy = pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
- (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
+ int sourceEnergy = calc_energy_8x8(source + i * sstride + j, sstride);
+ int reconEnergy = calc_energy_8x8(recon + i * rstride + j, rstride);
totEnergy += abs(sourceEnergy - reconEnergy);
}
@@ -1419,11 +1536,9 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
}
else
{
- /* 4x4 is too small for sa8d */
- int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf,
- 0) >> 2);
- int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf,
- 0) >> 2);
+ int sourceEnergy = calc_energy_4x4(source, sstride);
+ int reconEnergy = calc_energy_4x4(recon, rstride);
+
return abs(sourceEnergy - reconEnergy);
}
}
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index e189fdcd7..480278e5e 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -724,88 +724,6 @@ function PFX(ssim_4x4x2_core_neon)
ret
endfunc
-// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
-function PFX(psyCost_4x4_neon)
- ld1r {v4.2s}, [x0], x1
- ld1r {v5.2s}, [x0], x1
- ld1 {v4.s}[1], [x0], x1
- ld1 {v5.s}[1], [x0], x1
-
- ld1r {v6.2s}, [x2], x3
- ld1r {v7.2s}, [x2], x3
- ld1 {v6.s}[1], [x2], x3
- ld1 {v7.s}[1], [x2], x3
-
- uaddl v2.8h, v4.8b, v5.8b
- usubl v3.8h, v4.8b, v5.8b
- uaddl v18.8h, v6.8b, v7.8b
- usubl v19.8h, v6.8b, v7.8b
-
- mov v20.d[0], v2.d[1]
- add v0.4h, v2.4h, v20.4h
- sub v1.4h, v2.4h, v20.4h
- mov v21.d[0], v3.d[1]
- add v22.4h, v3.4h, v21.4h
- sub v23.4h, v3.4h, v21.4h
-
- mov v24.d[0], v18.d[1]
- add v16.4h, v18.4h, v24.4h
- sub v17.4h, v18.4h, v24.4h
- mov v25.d[0], v19.d[1]
- add v26.4h, v19.4h, v25.4h
- sub v27.4h, v19.4h, v25.4h
-
- mov v0.d[1], v22.d[0]
- mov v1.d[1], v23.d[0]
- trn1 v22.8h, v0.8h, v1.8h
- trn2 v23.8h, v0.8h, v1.8h
- mov v16.d[1], v26.d[0]
- mov v17.d[1], v27.d[0]
- trn1 v26.8h, v16.8h, v17.8h
- trn2 v27.8h, v16.8h, v17.8h
-
- add v2.8h, v22.8h, v23.8h
- sub v3.8h, v22.8h, v23.8h
- add v18.8h, v26.8h, v27.8h
- sub v19.8h, v26.8h, v27.8h
-
- uaddl v20.8h, v4.8b, v5.8b
- uaddl v21.8h, v6.8b, v7.8b
-
- trn1 v0.4s, v2.4s, v3.4s
- trn2 v1.4s, v2.4s, v3.4s
- trn1 v16.4s, v18.4s, v19.4s
- trn2 v17.4s, v18.4s, v19.4s
- abs v0.8h, v0.8h
- abs v16.8h, v16.8h
- abs v1.8h, v1.8h
- abs v17.8h, v17.8h
-
- uaddlv s20, v20.8h
- uaddlv s21, v21.8h
- mov v20.s[1], v21.s[0]
-
- smax v0.8h, v0.8h, v1.8h
- smax v16.8h, v16.8h, v17.8h
-
- trn1 v4.2d, v0.2d, v16.2d
- trn2 v5.2d, v0.2d, v16.2d
- add v0.8h, v4.8h, v5.8h
- mov v4.d[0], v0.d[1]
- uaddlv s0, v0.4h
- uaddlv s4, v4.4h
-
- ushr v20.2s, v20.2s, #2
- mov v0.s[1], v4.s[0]
- sub v0.2s, v0.2s, v20.2s
- mov w0, v0.s[0]
- mov w1, v0.s[1]
- subs w0, w0, w1
- cneg w0, w0, mi
-
- ret
-endfunc
-
// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
function PFX(quant_neon)
mov w9, #1
--
2.39.5 (Apple Git-154)