[x265] [PATCH 2/3] AArch64: Simplify SBD and HBD psyCost_pp_neon

Li Zhang li.zhang2 at arm.com
Wed Apr 30 18:17:57 UTC 2025


Simplify the psyCost_pp_neon implementation by removing the unnecessary
computation of absolute differences against zeroBuf in the calls to the
SA8D/SATD and SAD functions. These calls are replaced with a combined
function that computes both the Hadamard 4/8 transform and the coefficient
sum, and returns their difference.

Also remove the assembly implementation for the 4x4 block size, which is
slower than the simplified intrinsics.
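
For reference, the per-block quantity that both the old and the new code
compute is the block's AC energy: an SA8D-style sum of absolute Hadamard
coefficients minus the DC term (pixel sum >> 2, i.e. SAD against a zero
block >> 2). A minimal scalar sketch of the 8-bit 8x8 case follows; the
4x4 path is analogous with a 4x4 Hadamard (SATD). The helper name and the
exact rounding constants below are illustrative only, since the Neon code
folds part of the normalisation into its hadamard_4_v/hadamard_8_h helpers
and differs per bit depth.

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    // Illustrative scalar model of the per-block AC energy used by psyCost_pp.
    // energy = sum of |8x8 Hadamard coefficients| (rounded/scaled, "AC + DC")
    //          minus the DC term (plain pixel sum >> 2).
    static int energy_8x8_scalar(const uint8_t *src, ptrdiff_t stride)
    {
        int32_t m[8][8];
        int pixelSum = 0;

        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
            {
                m[i][j] = src[i * stride + j];
                pixelSum += m[i][j];
            }

        // 1-D unnormalised Walsh-Hadamard butterflies on every row, then column.
        for (int i = 0; i < 8; i++)
            for (int step = 1; step < 8; step <<= 1)
                for (int j = 0; j < 8; j += 2 * step)
                    for (int k = j; k < j + step; k++)
                    {
                        int32_t a = m[i][k], b = m[i][k + step];
                        m[i][k] = a + b;
                        m[i][k + step] = a - b;
                    }

        for (int j = 0; j < 8; j++)
            for (int step = 1; step < 8; step <<= 1)
                for (int i = 0; i < 8; i += 2 * step)
                    for (int k = i; k < i + step; k++)
                    {
                        int32_t a = m[k][j], b = m[k + step][j];
                        m[k][j] = a + b;
                        m[k + step][j] = a - b;
                    }

        int satSum = 0;
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                satSum += std::abs(m[i][j]);

        int sa8 = (satSum + 2) >> 2;  // AC + DC (rounding as in the generic C sa8d path)
        int dc = pixelSum >> 2;       // DC
        return sa8 - dc;              // AC energy of the block
    }

The removed zeroBuf-based calls computed exactly these two terms via SA8D/SATD
and SAD against a zero block; the combined helpers below compute them in one pass.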
---
 source/common/aarch64/asm-primitives.cpp |   3 -
 source/common/aarch64/pixel-prim.cpp     | 139 +++++++++++++++++++++--
 source/common/aarch64/pixel-util.S       |  82 -------------
 3 files changed, 127 insertions(+), 97 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 4d2c575d1..e16150d4f 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -671,9 +671,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].normFact = PFX(normFact32_neon);
     p.cu[BLOCK_64x64].normFact = PFX(normFact64_neon);
 
-    // psy_cost_pp
-    p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
-
 #if !defined(__APPLE__)
     p.scanPosLast = PFX(scanPosLast_neon);
 #endif
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 67c388b59..b08f98457 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1393,11 +1393,131 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i
     }
 }
 
+#if HIGH_BIT_DEPTH
+static inline int calc_energy_8x8(const uint16_t *source, intptr_t sstride)
+{
+    uint16x8_t s[8];
+    load_u16x8xn<8>(source, sstride, s);
+
+    int16x8_t in[8], temp[8];
+
+    in[0] = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1]));
+    in[1] = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3]));
+    in[2] = vreinterpretq_s16_u16(vaddq_u16(s[4], s[5]));
+    in[3] = vreinterpretq_s16_u16(vaddq_u16(s[6], s[7]));
+    in[4] = vreinterpretq_s16_u16(vsubq_u16(s[0], s[1]));
+    in[5] = vreinterpretq_s16_u16(vsubq_u16(s[2], s[3]));
+    in[6] = vreinterpretq_s16_u16(vsubq_u16(s[4], s[5]));
+    in[7] = vreinterpretq_s16_u16(vsubq_u16(s[6], s[7]));
+
+    hadamard_4_v(in, temp);
+    hadamard_4_v(in + 4, temp + 4);
+
+    // The first line after the vertical hadamard transform contains the sum of coefficients.
+    int sum = vaddlvq_s16(temp[0]) >> 2;
+
+#if X265_DEPTH == 10
+    uint16x8_t sa8_out[4];
+
+    hadamard_8_h(temp, sa8_out);
+
+    uint32x4_t res = vpaddlq_u16(sa8_out[0]);
+    res = vpadalq_u16(res, sa8_out[1]);
+    res = vpadalq_u16(res, sa8_out[2]);
+    res = vpadalq_u16(res, sa8_out[3]);
+#else // X265_DEPTH == 12
+    uint32x4_t sa8_out[4];
+
+    hadamard_8_h(temp, sa8_out);
+
+    sa8_out[0] = vaddq_u32(sa8_out[0], sa8_out[1]);
+    sa8_out[2] = vaddq_u32(sa8_out[2], sa8_out[3]);
+    uint32x4_t res = vaddq_u32(sa8_out[0], sa8_out[2]);
+#endif // X265_DEPTH == 10
+
+    int sa8 = (vaddvq_u32(res) + 1) >> 1;
+
+    return sa8 - sum;
+}
+
+#else // !HIGH_BIT_DEPTH
+static inline int calc_energy_8x8(const uint8_t *source, intptr_t sstride)
+{
+    uint8x8_t s[8];
+    load_u8x8xn<8>(source, sstride, s);
+
+    int16x8_t in[8], temp[8];
+
+    in[0] = vreinterpretq_s16_u16(vaddl_u8(s[0], s[1]));
+    in[1] = vreinterpretq_s16_u16(vaddl_u8(s[2], s[3]));
+    in[2] = vreinterpretq_s16_u16(vaddl_u8(s[4], s[5]));
+    in[3] = vreinterpretq_s16_u16(vaddl_u8(s[6], s[7]));
+    in[4] = vreinterpretq_s16_u16(vsubl_u8(s[0], s[1]));
+    in[5] = vreinterpretq_s16_u16(vsubl_u8(s[2], s[3]));
+    in[6] = vreinterpretq_s16_u16(vsubl_u8(s[4], s[5]));
+    in[7] = vreinterpretq_s16_u16(vsubl_u8(s[6], s[7]));
+
+    hadamard_4_v(in, temp);
+    hadamard_4_v(in + 4, temp + 4);
+
+    // The first line after the vertical hadamard transform contains the sum of coefficients.
+    int sum = vaddvq_s16(temp[0]) >> 2;
+
+    uint16x8_t sa8_out[4];
+    hadamard_8_h(temp, sa8_out);
+
+    uint16x8_t res = vaddq_u16(sa8_out[0], sa8_out[1]);
+    res = vaddq_u16(res, sa8_out[2]);
+    res = vaddq_u16(res, sa8_out[3]);
+
+    int sa8 = (vaddlvq_u16(res) + 1) >> 1;
+
+    return sa8 - sum;
+}
+
+#endif // HIGH_BIT_DEPTH
+
+static inline int calc_energy_4x4(const pixel *source, intptr_t sstride)
+{
+#if HIGH_BIT_DEPTH
+    uint16x4_t s[4];
+    load_u16x4xn<4>(source, sstride, s);
+
+    uint16x8_t s01 = vcombine_u16(s[0], s[1]);
+    uint16x8_t s23 = vcombine_u16(s[2], s[3]);
+
+    int16x8_t s01_23 = vreinterpretq_s16_u16(vaddq_u16(s01, s23));
+    int16x8_t d01_23 = vreinterpretq_s16_u16(vsubq_u16(s01, s23));
+#else
+    uint8x8_t s[2];
+    s[0] = load_u8x4x2(source + 0 * sstride, sstride);
+    s[1] = load_u8x4x2(source + 2 * sstride, sstride);
+
+    int16x8_t s01_23 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[1]));
+    int16x8_t d01_23 = vreinterpretq_s16_u16(vsubl_u8(s[0], s[1]));
+#endif
+
+    // The first line after the vertical hadamard transform contains the sum of coefficients.
+    int sum = vaddvq_u16(vreinterpretq_u16_s16(s01_23)) >> 2;
+
+    int16x8_t t0, t1;
+
+    transpose_s16_s64x2(&t0, &t1, s01_23, d01_23);
+    sumsubq_s16(&s01_23, &d01_23, t0, t1);
+
+    transpose_s16_s16x2(&t0, &t1, s01_23, d01_23);
+    sumsubq_s16(&s01_23, &d01_23, t0, t1);
+
+    transpose_s16_s32x2(&t0, &t1, s01_23, d01_23);
+
+    int sat = vaddvq_u16(max_abs_s16(t0, t1));
+
+    return sat - sum;
+}
+
 template<int size>
 int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, intptr_t rstride)
 {
-    static pixel zeroBuf[8] /* = { 0 } */;
-
     if (size)
     {
         int dim = 1 << (size + 2);
@@ -1406,11 +1526,8 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
         {
             for (int j = 0; j < dim; j += 8)
             {
-                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
-                int sourceEnergy = pixel_sa8d_8x8_neon(source + i * sstride + j, sstride, zeroBuf, 0) -
-                                   (sad_pp_neon<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
-                int reconEnergy =  pixel_sa8d_8x8_neon(recon + i * rstride + j, rstride, zeroBuf, 0) -
-                                   (sad_pp_neon<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
+                int sourceEnergy = calc_energy_8x8(source + i * sstride + j, sstride);
+                int reconEnergy = calc_energy_8x8(recon + i * rstride + j, rstride);
 
                 totEnergy += abs(sourceEnergy - reconEnergy);
             }
@@ -1419,11 +1536,9 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
     }
     else
     {
-        /* 4x4 is too small for sa8d */
-        int sourceEnergy = pixel_satd_4x4_neon(source, sstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(source, sstride, zeroBuf,
-                           0) >> 2);
-        int reconEnergy = pixel_satd_4x4_neon(recon, rstride, zeroBuf, 0) - (sad_pp_neon<4, 4>(recon, rstride, zeroBuf,
-                          0) >> 2);
+        int sourceEnergy = calc_energy_4x4(source, sstride);
+        int reconEnergy = calc_energy_4x4(recon, rstride);
+
         return abs(sourceEnergy - reconEnergy);
     }
 }
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index e189fdcd7..480278e5e 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -724,88 +724,6 @@ function PFX(ssim_4x4x2_core_neon)
     ret
 endfunc
 
-// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
-function PFX(psyCost_4x4_neon)
-    ld1r            {v4.2s}, [x0], x1
-    ld1r            {v5.2s}, [x0], x1
-    ld1             {v4.s}[1], [x0], x1
-    ld1             {v5.s}[1], [x0], x1
-
-    ld1r            {v6.2s}, [x2], x3
-    ld1r            {v7.2s}, [x2], x3
-    ld1             {v6.s}[1], [x2], x3
-    ld1             {v7.s}[1], [x2], x3
-
-    uaddl           v2.8h, v4.8b, v5.8b
-    usubl           v3.8h, v4.8b, v5.8b
-    uaddl           v18.8h, v6.8b, v7.8b
-    usubl           v19.8h, v6.8b, v7.8b
-
-    mov             v20.d[0], v2.d[1]
-    add             v0.4h, v2.4h, v20.4h
-    sub             v1.4h, v2.4h, v20.4h
-    mov             v21.d[0], v3.d[1]
-    add             v22.4h, v3.4h, v21.4h
-    sub             v23.4h, v3.4h, v21.4h
-
-    mov             v24.d[0], v18.d[1]
-    add             v16.4h, v18.4h, v24.4h
-    sub             v17.4h, v18.4h, v24.4h
-    mov             v25.d[0], v19.d[1]
-    add             v26.4h, v19.4h, v25.4h
-    sub             v27.4h, v19.4h, v25.4h
-
-    mov             v0.d[1], v22.d[0]
-    mov             v1.d[1], v23.d[0]
-    trn1            v22.8h, v0.8h, v1.8h
-    trn2            v23.8h, v0.8h, v1.8h
-    mov             v16.d[1], v26.d[0]
-    mov             v17.d[1], v27.d[0]
-    trn1            v26.8h, v16.8h, v17.8h
-    trn2            v27.8h, v16.8h, v17.8h
-
-    add             v2.8h, v22.8h, v23.8h
-    sub             v3.8h, v22.8h, v23.8h
-    add             v18.8h, v26.8h, v27.8h
-    sub             v19.8h, v26.8h, v27.8h
-
-    uaddl           v20.8h, v4.8b, v5.8b
-    uaddl           v21.8h, v6.8b, v7.8b
-
-    trn1            v0.4s, v2.4s, v3.4s
-    trn2            v1.4s, v2.4s, v3.4s
-    trn1            v16.4s, v18.4s, v19.4s
-    trn2            v17.4s, v18.4s, v19.4s
-    abs             v0.8h, v0.8h
-    abs             v16.8h, v16.8h
-    abs             v1.8h, v1.8h
-    abs             v17.8h, v17.8h
-
-    uaddlv          s20, v20.8h
-    uaddlv          s21, v21.8h
-    mov             v20.s[1], v21.s[0]
-
-    smax            v0.8h, v0.8h, v1.8h
-    smax            v16.8h, v16.8h, v17.8h
-
-    trn1            v4.2d, v0.2d, v16.2d
-    trn2            v5.2d, v0.2d, v16.2d
-    add             v0.8h, v4.8h, v5.8h
-    mov             v4.d[0], v0.d[1]
-    uaddlv          s0, v0.4h
-    uaddlv          s4, v4.4h
-
-    ushr            v20.2s, v20.2s, #2
-    mov             v0.s[1], v4.s[0]
-    sub             v0.2s, v0.2s, v20.2s
-    mov             w0, v0.s[0]
-    mov             w1, v0.s[1]
-    subs            w0, w0, w1
-    cneg            w0, w0, mi
-
-    ret
-endfunc
-
 // uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
 function PFX(quant_neon)
     mov             w9, #1
-- 
2.39.5 (Apple Git-154)
