[x265] [PATCH 1/3] AArch64: Clean up satd/sa8d functions

Li Zhang li.zhang2 at arm.com
Wed Apr 30 18:17:36 UTC 2025


Clean up and optimize the Neon intrinsics implementations of the
satd/sa8d primitives for all bit depths.

Remove the Neon and SVE assembly implementations of these primitives
since they are now slower than the Neon intrinsics implementations.
---
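Note for reviewers (not part of the commit message or the patch): the satd/sa8d
kernels, both before and after this change, fold the final Hadamard butterfly
stage into the absolute-value step using the identity
|a| + |b| == max(|a + b|, |a - b|), which is what max_abs_s16()/max_abs_s32()
compute lane-wise when fed the outputs of the last sumsub stage. A minimal
scalar C sketch of that identity (helper names here are illustrative only):

    #include <assert.h>
    #include <stdlib.h>

    /* Sum of absolute values, computed directly. */
    static int abs_pair_direct(int a, int b)
    {
        return abs(a) + abs(b);
    }

    /* Same value, recovered from the butterfly outputs (a + b, a - b),
     * mirroring the vabs + vmax step in the Neon kernels. */
    static int abs_pair_folded(int a, int b)
    {
        int s = abs(a + b);
        int d = abs(a - b);
        return s > d ? s : d;
    }

    int main(void)
    {
        for (int a = -255; a <= 255; a++)
            for (int b = -255; b <= 255; b++)
                assert(abs_pair_direct(a, b) == abs_pair_folded(a, b));
        return 0;
    }
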
 source/common/aarch64/asm-primitives.cpp |   76 --
 source/common/aarch64/mem-neon.h         |    6 +-
 source/common/aarch64/pixel-prim.cpp     | 1407 +++++++++++-----------
 source/common/aarch64/pixel-util-sve.S   |  258 ----
 source/common/aarch64/pixel-util.S       |  957 ---------------
 5 files changed, 712 insertions(+), 1992 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 6097f7655..4d2c575d1 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -652,64 +652,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
     p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
 
-    // satd
-    ALL_LUMA_PU(satd, pixel_satd, neon);
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = PFX(pixel_satd_4x4_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = PFX(pixel_satd_8x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = PFX(pixel_satd_8x4_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = PFX(pixel_satd_4x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = PFX(pixel_satd_16x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = PFX(pixel_satd_8x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = PFX(pixel_satd_16x4_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = PFX(pixel_satd_4x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = PFX(pixel_satd_32x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = PFX(pixel_satd_8x32_neon);
-
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = PFX(pixel_satd_4x8_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = PFX(pixel_satd_8x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = PFX(pixel_satd_4x4_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = PFX(pixel_satd_8x8_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = PFX(pixel_satd_4x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = PFX(pixel_satd_8x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = PFX(pixel_satd_8x12_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = PFX(pixel_satd_8x4_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = PFX(pixel_satd_16x8_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = PFX(pixel_satd_4x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = PFX(pixel_satd_8x64_neon);
-
-    // sa8d
-    p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_neon);
-    p.cu[BLOCK_8x8].sa8d   = PFX(pixel_sa8d_8x8_neon);
-    p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
-    p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
-    p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-
     // dequant_scaling
     p.dequant_scaling = PFX(dequant_scaling_neon);
 
@@ -857,24 +799,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
 
 #if !HIGH_BIT_DEPTH
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps  = PFX(pixel_sub_ps_8x16_sve);
-
-    // satd
-    p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_sve);
-    p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sve);
-    p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_sve);
-    p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sve);
-    p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sve);
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = PFX(pixel_satd_4x4_sve);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = PFX(pixel_satd_8x4_sve);
-
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = PFX(pixel_satd_4x4_sve);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = PFX(pixel_satd_8x12_sve);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = PFX(pixel_satd_8x4_sve);
-
-    // sa8d
-    p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_sve);
-    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
 #else // HIGH_BIT_DEPTH
     // sse_pp
     p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_sve);
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 263c1d569..8bd5fbee9 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -106,8 +106,7 @@ static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
 {
     for (int i = 0; i < N; ++i)
     {
-        dst[i] = vld1q_u8(src);
-        src += stride;
+        dst[i] = vld1q_u8(src + i * stride);
     }
 }
 
@@ -230,8 +229,7 @@ static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
 {
     for (int i = 0; i < N; ++i)
     {
-        dst[i] = vld1q_u16(src);
-        src += stride;
+        dst[i] = vld1q_u16(src + i * stride);
     }
 }
 
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 15ccdff22..67c388b59 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -19,799 +19,805 @@ namespace
 {
 
 
-/* SATD SA8D variants - based on x264 */
-static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
+static inline void sumsubq_s16(int16x8_t *sum, int16x8_t *sub, const int16x8_t a, const int16x8_t b)
 {
-    sum = vaddq_s16(a, b);
-    sub = vsubq_s16(a, b);
+    *sum = vaddq_s16(a, b);
+    *sub = vsubq_s16(a, b);
 }
 
-static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
-                                   const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s16x2(int16x8_t *t1, int16x8_t *t2,
+                                       const int16x8_t s1, const int16x8_t s2)
 {
-    t1 = vtrn1q_s16(s1, s2);
-    t2 = vtrn2q_s16(s1, s2);
+    *t1 = vtrn1q_s16(s1, s2);
+    *t2 = vtrn2q_s16(s1, s2);
 }
 
-static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
-                                   const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s32x2(int16x8_t *t1, int16x8_t *t2,
+                                       const int16x8_t s1, const int16x8_t s2)
 {
     int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
     int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
 
-    t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
-    t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
+    *t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
+    *t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
 }
 
-static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
-                                   const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s64x2(int16x8_t *t1, int16x8_t *t2,
+                                       const int16x8_t s1, const int16x8_t s2)
 {
     int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
     int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
 
-    t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
-    t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
+    *t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
+    *t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
 }
 
-static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
-                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
+static inline uint16x8_t max_abs_s16(const int16x8_t a, const int16x8_t b)
 {
-    SUMSUB_AB(s1, d1, a, b);
-    SUMSUB_AB(s2, d2, c, d);
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(a));
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(b));
+
+    return vmaxq_u16(abs0, abs1);
 }
 
-static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
-                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
+#if X265_DEPTH == 12
+static inline void sumsubq_s32(int32x4_t *sum, int32x4_t *sub, const int32x4_t a, const int32x4_t b)
 {
-    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
-    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
+    *sum = vaddq_s32(a, b);
+    *sub = vsubq_s32(a, b);
 }
 
-
-static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
-
+static inline void sumsublq_s16(int32x4_t *sum_lo, int32x4_t *sum_hi,
+                                int32x4_t *sub_lo, int32x4_t *sub_hi,
+                                const int16x8_t a, const int16x8_t b)
 {
+    *sum_lo = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+    *sub_lo = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+    *sum_hi = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+    *sub_hi = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+}
 
-    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
-
-
-    SUMSUB_AB(v16, v17, v0,  v1);
-    SUMSUB_AB(v18, v19, v2,  v3);
-
-    SUMSUB_AB(v4 , v6 , v16, v18);
-    SUMSUB_AB(v5 , v7 , v17, v19);
-
-    transpose_8h_8h(v0, v1, v4, v5);
-    transpose_8h_8h(v2, v3, v6, v7);
+static inline void transpose_inplace_s32_s64x2(int32x4_t *t1, int32x4_t *t2)
+{
+    int64x2_t tmp1 = vreinterpretq_s64_s32(*t1);
+    int64x2_t tmp2 = vreinterpretq_s64_s32(*t2);
 
-    SUMSUB_AB(v16, v17, v0,  v1);
-    SUMSUB_AB(v18, v19, v2,  v3);
+    *t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
+    *t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+}
 
-    transpose_4s_8h(v0, v1, v16, v18);
-    transpose_4s_8h(v2, v3, v17, v19);
+static inline uint32x4_t max_abs_s32(int32x4_t a, int32x4_t b)
+{
+    uint32x4_t abs0 = vreinterpretq_u32_s32(vabsq_s32(a));
+    uint32x4_t abs1 = vreinterpretq_u32_s32(vabsq_s32(b));
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
-    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
-    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+    return vmaxq_u32(abs0, abs1);
+}
 
-    uint16x8_t max0 = vmaxq_u16(abs0, abs1);
-    uint16x8_t max1 = vmaxq_u16(abs2, abs3);
+#endif // X265_DEPTH == 12
 
-    uint16x8_t sum = vaddq_u16(max0, max1);
-    return vaddlvq_u16(sum);
+#if HIGH_BIT_DEPTH
+static inline void load_diff_u16x8x4(const uint16_t *pix1, intptr_t stride_pix1,
+                                     const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+    uint16x8_t r[4], t[4];
+    load_u16x8xn<4>(pix1, stride_pix1, r);
+    load_u16x8xn<4>(pix2, stride_pix2, t);
+
+    diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+    diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+    diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+    diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
 }
 
-static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
+static inline void load_diff_u16x8x4_dual(const uint16_t *pix1, intptr_t stride_pix1,
+                                          const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
 {
-    int16x8_t v2, v3;
-    SUMSUB_AB(v2,  v3,  v0,  v1);
-
-    transpose_2d_8h(v0, v1, v2, v3);
-    SUMSUB_AB(v2,  v3,  v0,  v1);
-
-    transpose_8h_8h(v0, v1, v2, v3);
-    SUMSUB_AB(v2,  v3,  v0,  v1);
+    load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+                      pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
+}
 
-    transpose_4s_8h(v0, v1, v2, v3);
+static inline void load_diff_u16x8x8(const uint16_t *pix1, intptr_t stride_pix1,
+                                     const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+    uint16x8_t r[8], t[8];
+    load_u16x8xn<8>(pix1, stride_pix1, r);
+    load_u16x8xn<8>(pix2, stride_pix2, t);
+
+    diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+    diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+    diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+    diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
+    diff[4] = vreinterpretq_s16_u16(vsubq_u16(r[4], t[4]));
+    diff[5] = vreinterpretq_s16_u16(vsubq_u16(r[5], t[5]));
+    diff[6] = vreinterpretq_s16_u16(vsubq_u16(r[6], t[6]));
+    diff[7] = vreinterpretq_s16_u16(vsubq_u16(r[7], t[7]));
+}
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
-    uint16x8_t max = vmaxq_u16(abs0, abs1);
+#else // !HIGH_BIT_DEPTH
+static inline void load_diff_u8x8x4(const uint8_t *pix1, intptr_t stride_pix1,
+                                    const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+    uint8x8_t r[4], t[4];
+    load_u8x8xn<4>(pix1, stride_pix1, r);
+    load_u8x8xn<4>(pix2, stride_pix2, t);
+
+    diff[0] = vreinterpretq_s16_u16(vsubl_u8(r[0], t[0]));
+    diff[1] = vreinterpretq_s16_u16(vsubl_u8(r[1], t[1]));
+    diff[2] = vreinterpretq_s16_u16(vsubl_u8(r[2], t[2]));
+    diff[3] = vreinterpretq_s16_u16(vsubl_u8(r[3], t[3]));
+}
 
-    return vaddlvq_u16(max);
+static inline void load_diff_u8x8x8(const uint8_t *pix1, intptr_t stride_pix1,
+                                    const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+    load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+                     pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
 }
 
-static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
-                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline void load_diff_u8x16x4(const uint8_t *pix1, intptr_t stride_pix1,
+                                     const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
 {
-    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
+    uint8x16_t s1[4], s2[4];
+    load_u8x16xn<4>(pix1, stride_pix1, s1);
+    load_u8x16xn<4>(pix2, stride_pix2, s2);
+
+    diff[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[0]), vget_low_u8(s2[0])));
+    diff[1] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[1]), vget_low_u8(s2[1])));
+    diff[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[2]), vget_low_u8(s2[2])));
+    diff[3] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[3]), vget_low_u8(s2[3])));
+    diff[4] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[0]), vget_high_u8(s2[0])));
+    diff[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[1]), vget_high_u8(s2[1])));
+    diff[6] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[2]), vget_high_u8(s2[2])));
+    diff[7] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[3]), vget_high_u8(s2[3])));
+}
 
-    SUMSUB_AB(v16, v18, v0,  v2);
-    SUMSUB_AB(v17, v19, v1,  v3);
+#endif // HIGH_BIT_DEPTH
 
-    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
+// 4 way hadamard vertical pass.
+static inline void hadamard_4_v(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
+{
+    int16x8_t s0, s1, d0, d1;
 
-    transpose_8h_8h(v0,  v1,  v16, v17);
-    transpose_8h_8h(v2,  v3,  v18, v19);
-    transpose_8h_8h(v4,  v5,  v20, v21);
-    transpose_8h_8h(v6,  v7,  v22, v23);
+    sumsubq_s16(&s0, &d0, in_coefs[0], in_coefs[1]);
+    sumsubq_s16(&s1, &d1, in_coefs[2], in_coefs[3]);
 
-    SUMSUB_AB(v16, v17, v0,  v1);
-    SUMSUB_AB(v18, v19, v2,  v3);
-    SUMSUB_AB(v20, v21, v4,  v5);
-    SUMSUB_AB(v22, v23, v6,  v7);
+    sumsubq_s16(&out_coefs[0], &out_coefs[2], s0, s1);
+    sumsubq_s16(&out_coefs[1], &out_coefs[3], d0, d1);
+}
 
-    transpose_4s_8h(v0,  v2,  v16, v18);
-    transpose_4s_8h(v1,  v3,  v17, v19);
-    transpose_4s_8h(v4,  v6,  v20, v22);
-    transpose_4s_8h(v5,  v7,  v21, v23);
+// 8 way hadamard vertical pass.
+static inline void hadamard_8_v(const int16x8_t in_coefs[8], int16x8_t out_coefs[8])
+{
+    int16x8_t temp[8];
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
-    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
-    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
-    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
-    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
-    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
-    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+    hadamard_4_v(in_coefs, temp);
+    hadamard_4_v(in_coefs + 4, temp + 4);
 
-    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
-    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
-    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
-    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
+    sumsubq_s16(&out_coefs[0], &out_coefs[4], temp[0], temp[4]);
+    sumsubq_s16(&out_coefs[1], &out_coefs[5], temp[1], temp[5]);
+    sumsubq_s16(&out_coefs[2], &out_coefs[6], temp[2], temp[6]);
+    sumsubq_s16(&out_coefs[3], &out_coefs[7], temp[3], temp[7]);
 }
 
-#if HIGH_BIT_DEPTH
-
-#if (X265_DEPTH > 10)
-static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
-                                   const int32x4_t s1, const int32x4_t s2)
+// 4 way hadamard horizontal pass.
+static inline void hadamard_4_h(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
 {
-    int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
-    int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
+    int16x8_t s0, s1, d0, d1, t0, t1, t2, t3;
+
+    transpose_s16_s16x2(&t0, &t1, in_coefs[0], in_coefs[1]);
+    transpose_s16_s16x2(&t2, &t3, in_coefs[2], in_coefs[3]);
+
+    sumsubq_s16(&s0, &d0, t0, t1);
+    sumsubq_s16(&s1, &d1, t2, t3);
 
-    t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
-    t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+    transpose_s16_s32x2(&out_coefs[0], &out_coefs[1], s0, s1);
+    transpose_s16_s32x2(&out_coefs[2], &out_coefs[3], d0, d1);
 }
 
-static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
+#if X265_DEPTH != 12
+// 8 way hadamard horizontal pass.
+static inline void hadamard_8_h(int16x8_t coefs[8], uint16x8_t out[4])
 {
-    sum = vaddq_s32(a, b);
-    sub = vsubq_s32(a, b);
+    int16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+    int16x8_t temp[8];
+
+    hadamard_4_h(coefs, temp);
+    hadamard_4_h(coefs + 4, temp + 4);
+
+    sumsubq_s16(&s0, &d0, temp[0], temp[1]);
+    sumsubq_s16(&s1, &d1, temp[2], temp[3]);
+    sumsubq_s16(&s2, &d2, temp[4], temp[5]);
+    sumsubq_s16(&s3, &d3, temp[6], temp[7]);
+
+    transpose_s16_s64x2(&temp[0], &temp[1], s0, s2);
+    transpose_s16_s64x2(&temp[2], &temp[3], s1, s3);
+    transpose_s16_s64x2(&temp[4], &temp[5], d0, d2);
+    transpose_s16_s64x2(&temp[6], &temp[7], d1, d3);
+
+    out[0] = max_abs_s16(temp[0], temp[1]);
+    out[1] = max_abs_s16(temp[2], temp[3]);
+    out[2] = max_abs_s16(temp[4], temp[5]);
+    out[3] = max_abs_s16(temp[6], temp[7]);
 }
 
-static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
-        const int16x8_t a, const int16x8_t b)
+#else // X265_DEPTH == 12
+static inline void hadamard_8_h(int16x8_t coefs[8], uint32x4_t out[4])
 {
-    suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
-    sumh = vaddl_high_s16(a, b);
-    subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
-    subh = vsubl_high_s16(a, b);
+    int16x8_t a[8];
+
+    transpose_s16_s16x2(&a[0], &a[1], coefs[0], coefs[1]);
+    transpose_s16_s16x2(&a[2], &a[3], coefs[2], coefs[3]);
+    transpose_s16_s16x2(&a[4], &a[5], coefs[4], coefs[5]);
+    transpose_s16_s16x2(&a[6], &a[7], coefs[6], coefs[7]);
+
+    int32x4_t a_lo[8], a_hi[8], b_lo[8], b_hi[8];
+
+    sumsublq_s16(&a_lo[0], &a_hi[0], &a_lo[4], &a_hi[4], a[0], a[1]);
+    sumsublq_s16(&a_lo[1], &a_hi[1], &a_lo[5], &a_hi[5], a[2], a[3]);
+    sumsublq_s16(&a_lo[2], &a_hi[2], &a_lo[6], &a_hi[6], a[4], a[5]);
+    sumsublq_s16(&a_lo[3], &a_hi[3], &a_lo[7], &a_hi[7], a[6], a[7]);
+
+    transpose_inplace_s32_s64x2(&a_lo[0], &a_lo[1]);
+    transpose_inplace_s32_s64x2(&a_lo[2], &a_lo[3]);
+    transpose_inplace_s32_s64x2(&a_lo[4], &a_lo[5]);
+    transpose_inplace_s32_s64x2(&a_lo[6], &a_lo[7]);
+
+    transpose_inplace_s32_s64x2(&a_hi[0], &a_hi[1]);
+    transpose_inplace_s32_s64x2(&a_hi[2], &a_hi[3]);
+    transpose_inplace_s32_s64x2(&a_hi[4], &a_hi[5]);
+    transpose_inplace_s32_s64x2(&a_hi[6], &a_hi[7]);
+
+    sumsubq_s32(&b_lo[0], &b_lo[1], a_lo[0], a_lo[1]);
+    sumsubq_s32(&b_lo[2], &b_lo[3], a_lo[2], a_lo[3]);
+    sumsubq_s32(&b_lo[4], &b_lo[5], a_lo[4], a_lo[5]);
+    sumsubq_s32(&b_lo[6], &b_lo[7], a_lo[6], a_lo[7]);
+
+    sumsubq_s32(&b_hi[0], &b_hi[1], a_hi[0], a_hi[1]);
+    sumsubq_s32(&b_hi[2], &b_hi[3], a_hi[2], a_hi[3]);
+    sumsubq_s32(&b_hi[4], &b_hi[5], a_hi[4], a_hi[5]);
+    sumsubq_s32(&b_hi[6], &b_hi[7], a_hi[6], a_hi[7]);
+
+    uint32x4_t max0_lo = max_abs_s32(b_lo[0], b_hi[0]);
+    uint32x4_t max1_lo = max_abs_s32(b_lo[1], b_hi[1]);
+    uint32x4_t max2_lo = max_abs_s32(b_lo[2], b_hi[2]);
+    uint32x4_t max3_lo = max_abs_s32(b_lo[3], b_hi[3]);
+    uint32x4_t max0_hi = max_abs_s32(b_lo[4], b_hi[4]);
+    uint32x4_t max1_hi = max_abs_s32(b_lo[5], b_hi[5]);
+    uint32x4_t max2_hi = max_abs_s32(b_lo[6], b_hi[6]);
+    uint32x4_t max3_hi = max_abs_s32(b_lo[7], b_hi[7]);
+
+    out[0] = vaddq_u32(max0_lo, max0_hi);
+    out[1] = vaddq_u32(max1_lo, max1_hi);
+    out[2] = vaddq_u32(max2_lo, max2_hi);
+    out[3] = vaddq_u32(max3_lo, max3_hi);
 }
 
-#endif
+#endif // X265_DEPTH != 12
 
-static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
-                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
-                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int hadamard_4x4(int16x8_t a0, int16x8_t a1)
 {
-    uint16x8_t r0, r1, r2, r3;
-    uint16x8_t t0, t1, t2, t3;
-    int16x8_t v16, v17;
-    int16x8_t v18, v19;
-
-    r0 = vld1q_u16(pix1 + 0 * stride_pix1);
-    r1 = vld1q_u16(pix1 + 1 * stride_pix1);
-    r2 = vld1q_u16(pix1 + 2 * stride_pix1);
-    r3 = vld1q_u16(pix1 + 3 * stride_pix1);
-
-    t0 = vld1q_u16(pix2 + 0 * stride_pix2);
-    t1 = vld1q_u16(pix2 + 1 * stride_pix2);
-    t2 = vld1q_u16(pix2 + 2 * stride_pix2);
-    t3 = vld1q_u16(pix2 + 3 * stride_pix2);
-
-    v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+    int16x8_t sum, dif, t0, t1;
+    sumsubq_s16(&sum, &dif, a0, a1);
 
-    r0 = vld1q_u16(pix1 + 4 * stride_pix1);
-    r1 = vld1q_u16(pix1 + 5 * stride_pix1);
-    r2 = vld1q_u16(pix1 + 6 * stride_pix1);
-    r3 = vld1q_u16(pix1 + 7 * stride_pix1);
+    transpose_s16_s64x2(&t0, &t1, sum, dif);
+    sumsubq_s16(&sum, &dif, t0, t1);
 
-    t0 = vld1q_u16(pix2 + 4 * stride_pix2);
-    t1 = vld1q_u16(pix2 + 5 * stride_pix2);
-    t2 = vld1q_u16(pix2 + 6 * stride_pix2);
-    t3 = vld1q_u16(pix2 + 7 * stride_pix2);
+    transpose_s16_s16x2(&t0, &t1, sum, dif);
+    sumsubq_s16(&sum, &dif, t0, t1);
 
-    v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+    transpose_s16_s32x2(&t0, &t1, sum, dif);
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    uint16x8_t max = max_abs_s16(t0, t1);
 
+    return vaddlvq_u16(max);
 }
 
-
-
-
-static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
-                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+// Calculate 2 4x4 hadamard transformations.
+static void hadamard_4x4_dual(int16x8_t diff[4], uint16x8_t *out)
 {
-    uint16x8_t r0, r1, r2, r3;
-    uint16x8_t t0, t1, t2, t3;
-    int16x8_t v16, v17, v20, v21;
-    int16x8_t v18, v19, v22, v23;
+    int16x8_t temp[4];
 
-    r0 = vld1q_u16(pix1 + 0 * stride_pix1);
-    r1 = vld1q_u16(pix1 + 1 * stride_pix1);
-    r2 = vld1q_u16(pix1 + 2 * stride_pix1);
-    r3 = vld1q_u16(pix1 + 3 * stride_pix1);
+    hadamard_4_v(diff, temp);
+    hadamard_4_h(temp, diff);
 
-    t0 = vld1q_u16(pix2 + 0 * stride_pix2);
-    t1 = vld1q_u16(pix2 + 1 * stride_pix2);
-    t2 = vld1q_u16(pix2 + 2 * stride_pix2);
-    t3 = vld1q_u16(pix2 + 3 * stride_pix2);
+    uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+    uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
 
-    v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
-
-    r0 = vld1q_u16(pix1 + 0 * stride_pix1 + 8);
-    r1 = vld1q_u16(pix1 + 1 * stride_pix1 + 8);
-    r2 = vld1q_u16(pix1 + 2 * stride_pix1 + 8);
-    r3 = vld1q_u16(pix1 + 3 * stride_pix1 + 8);
+    *out = vaddq_u16(sum0, sum1);
+}
 
-    t0 = vld1q_u16(pix2 + 0 * stride_pix2 + 8);
-    t1 = vld1q_u16(pix2 + 1 * stride_pix2 + 8);
-    t2 = vld1q_u16(pix2 + 2 * stride_pix2 + 8);
-    t3 = vld1q_u16(pix2 + 3 * stride_pix2 + 8);
+// Calculate 4 4x4 hadamard transformations.
+static inline void hadamard_4x4_quad(int16x8_t diff[8], uint16x8_t out[2])
+{
+    int16x8_t temp[8];
 
-    v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+    hadamard_4_v(diff, temp);
+    hadamard_4_v(diff + 4, temp + 4);
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    hadamard_4_h(temp, diff);
+    hadamard_4_h(temp + 4, diff + 4);
 
-    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+    uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+    uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
+    uint16x8_t sum2 = max_abs_s16(diff[4], diff[5]);
+    uint16x8_t sum3 = max_abs_s16(diff[6], diff[7]);
 
+    out[0] = vaddq_u16(sum0, sum1);
+    out[1] = vaddq_u16(sum2, sum3);
 }
 
-
-int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if X265_DEPTH == 8
+static inline void hadamard_8x8(int16x8_t diff[8], uint16x8_t out[2])
 {
-    uint16x4_t t0_0 = vld1_u16(pix1 + 0 * stride_pix1);
-    uint16x4_t t1_0 = vld1_u16(pix1 + 1 * stride_pix1);
-    uint16x4_t t0_1 = vld1_u16(pix1 + 2 * stride_pix1);
-    uint16x4_t t1_1 = vld1_u16(pix1 + 3 * stride_pix1);
-    uint16x8_t t0 = vcombine_u16(t0_0, t0_1);
-    uint16x8_t t1 = vcombine_u16(t1_0, t1_1);
+    int16x8_t temp[8];
+    uint16x8_t sum[4];
 
-    uint16x4_t r0_0 = vld1_u16(pix2 + 0 * stride_pix2);
-    uint16x4_t r1_0 = vld1_u16(pix2 + 1 * stride_pix2);
-    uint16x4_t r0_1 = vld1_u16(pix2 + 2 * stride_pix2);
-    uint16x4_t r1_1 = vld1_u16(pix2 + 3 * stride_pix2);
-    uint16x8_t r0 = vcombine_u16(r0_0, r0_1);
-    uint16x8_t r1 = vcombine_u16(r1_0, r1_1);
+    hadamard_8_v(diff, temp);
+    hadamard_8_h(temp, sum);
 
-    int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(t0, r0));
-    int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-
-    return _satd_4x4_neon(v0, v1);
+    out[0] = vaddq_u16(sum[0], sum[1]);
+    out[1] = vaddq_u16(sum[2], sum[3]);
 }
 
+#elif X265_DEPTH == 10
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
+{
+    int16x8_t temp[8];
+    uint16x8_t sum[4];
 
+    hadamard_8_v(diff, temp);
+    hadamard_8_h(temp, sum);
 
+    out[0] = vpaddlq_u16(sum[0]);
+    out[1] = vpaddlq_u16(sum[1]);
+    out[0] = vpadalq_u16(out[0], sum[2]);
+    out[1] = vpadalq_u16(out[1], sum[3]);
+}
 
-
-
-int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#elif X265_DEPTH == 12
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
 {
-    uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
-
-    i0 = vld1q_u16(pix1 + 0 * stride_pix1);
-    i1 = vld1q_u16(pix2 + 0 * stride_pix2);
-    i2 = vld1q_u16(pix1 + 1 * stride_pix1);
-    i3 = vld1q_u16(pix2 + 1 * stride_pix2);
-    i4 = vld1q_u16(pix1 + 2 * stride_pix1);
-    i5 = vld1q_u16(pix2 + 2 * stride_pix2);
-    i6 = vld1q_u16(pix1 + 3 * stride_pix1);
-    i7 = vld1q_u16(pix2 + 3 * stride_pix2);
+    int16x8_t temp[8];
+    uint32x4_t sum[4];
 
-    int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(i0, i1));
-    int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(i2, i3));
-    int16x8_t v2 = vreinterpretq_s16_u16(vsubq_u16(i4, i5));
-    int16x8_t v3 = vreinterpretq_s16_u16(vsubq_u16(i6, i7));
+    hadamard_8_v(diff, temp);
+    hadamard_8_h(temp, sum);
 
-    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+    out[0] = vaddq_u32(sum[0], sum[1]);
+    out[1] = vaddq_u32(sum[2], sum[3]);
 }
 
+#endif // X265_DEPTH == 8
 
-int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
-    int16x8_t v0, v1, v2, v3;
+    uint16x4_t s[4], r[4];
+    load_u16x4xn<4>(pix1, stride_pix1, s);
+    load_u16x4xn<4>(pix2, stride_pix2, r);
 
-    for (int offset = 0; offset <= 12; offset += 4)
-    {
-        _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1,
-                        pix2 + offset * stride_pix2,stride_pix2,
-                        v0, v1, v2, v3);
-        v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v0));
-        v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v1));
-        v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v2));
-        v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v3));
-    }
+    uint16x8_t s0 = vcombine_u16(s[0], s[2]);
+    uint16x8_t s1 = vcombine_u16(s[1], s[3]);
+    uint16x8_t r0 = vcombine_u16(r[0], r[2]);
+    uint16x8_t r1 = vcombine_u16(r[1], r[3]);
+
+    int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+    int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
 
-    return vaddvq_u32(vaddq_u32(v30, v31));
+    return hadamard_4x4(diff0, diff1);
 }
 
-#else       //HIGH_BIT_DEPTH
+static inline int pixel_satd_4x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[4];
+
+    uint16x4_t s[8], r[8];
+    load_u16x4xn<8>(pix1, stride_pix1, s);
+    load_u16x4xn<8>(pix2, stride_pix2, r);
+
+    uint16x8_t s0 = vcombine_u16(s[0], s[4]);
+    uint16x8_t s1 = vcombine_u16(s[1], s[5]);
+    uint16x8_t s2 = vcombine_u16(s[2], s[6]);
+    uint16x8_t s3 = vcombine_u16(s[3], s[7]);
+    uint16x8_t r0 = vcombine_u16(r[0], r[4]);
+    uint16x8_t r1 = vcombine_u16(r[1], r[5]);
+    uint16x8_t r2 = vcombine_u16(r[2], r[6]);
+    uint16x8_t r3 = vcombine_u16(r[3], r[7]);
+
+    diff[0] = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+    diff[1] = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
+    diff[2] = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+    diff[3] = vreinterpretq_s16_u16(vsubq_u16(r3, s3));
+
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
+
+    return vaddlvq_u16(out);
+}
 
-static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
-                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint8x16_t r0, r1, r2, r3;
-    uint8x16_t t0, t1, t2, t3;
-    int16x8_t v16, v17, v20, v21;
-    int16x8_t v18, v19, v22, v23;
+    int16x8_t diff[4];
+    load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
 
-    r0 = vld1q_u8(pix1 + 0 * stride_pix1);
-    r1 = vld1q_u8(pix1 + 1 * stride_pix1);
-    r2 = vld1q_u8(pix1 + 2 * stride_pix1);
-    r3 = vld1q_u8(pix1 + 3 * stride_pix1);
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
 
-    t0 = vld1q_u8(pix2 + 0 * stride_pix2);
-    t1 = vld1q_u8(pix2 + 1 * stride_pix2);
-    t2 = vld1q_u8(pix2 + 2 * stride_pix2);
-    t3 = vld1q_u8(pix2 + 3 * stride_pix2);
+    return vaddlvq_u16(out);
+}
 
-    v16 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r0), vget_low_u8(t0)));
-    v20 = vreinterpretq_s16_u16(vsubl_high_u8(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r1), vget_low_u8(t1)));
-    v21 = vreinterpretq_s16_u16(vsubl_high_u8(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r2), vget_low_u8(t2)));
-    v22 = vreinterpretq_s16_u16(vsubl_high_u8(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r3), vget_low_u8(t3)));
-    v23 = vreinterpretq_s16_u16(vsubl_high_u8(r3, t3));
+static inline int pixel_satd_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
+    uint16x8_t out[2];
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
 
-    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+    uint32x4_t res = vpaddlq_u16(out[0]);
+    res = vpadalq_u16(res, out[1]);
 
+    return vaddvq_u32(res);
 }
 
-
-static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
-                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
-                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int pixel_satd_8x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                       const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint8x8_t r0, r1, r2, r3;
-    uint8x8_t t0, t1, t2, t3;
-    int16x8_t v16, v17;
-    int16x8_t v18, v19;
+    int16x8_t diff[16];
+    uint16x8_t out[4];
+
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4_dual(pix1 + 8 * stride_pix1, stride_pix1,
+                           pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
 
-    r0 = vld1_u8(pix1 + 0 * stride_pix1);
-    r1 = vld1_u8(pix1 + 1 * stride_pix1);
-    r2 = vld1_u8(pix1 + 2 * stride_pix1);
-    r3 = vld1_u8(pix1 + 3 * stride_pix1);
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
-    t0 = vld1_u8(pix2 + 0 * stride_pix2);
-    t1 = vld1_u8(pix2 + 1 * stride_pix2);
-    t2 = vld1_u8(pix2 + 2 * stride_pix2);
-    t3 = vld1_u8(pix2 + 3 * stride_pix2);
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
-    v16 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+    uint32x4_t res = vpaddlq_u16(sum0);
+    res = vpadalq_u16(res, sum1);
 
-    r0 = vld1_u8(pix1 + 4 * stride_pix1);
-    r1 = vld1_u8(pix1 + 5 * stride_pix1);
-    r2 = vld1_u8(pix1 + 6 * stride_pix1);
-    r3 = vld1_u8(pix1 + 7 * stride_pix1);
+    return vaddvq_u32(res);
+}
 
-    t0 = vld1_u8(pix2 + 4 * stride_pix2);
-    t1 = vld1_u8(pix2 + 5 * stride_pix2);
-    t2 = vld1_u8(pix2 + 6 * stride_pix2);
-    t3 = vld1_u8(pix2 + 7 * stride_pix2);
+static inline int pixel_satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                       const uint16_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
 
-    v20 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
-    v21 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
-    v22 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
-    v23 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+    load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
 
+    uint16x8_t sum0, sum1;
+    hadamard_4x4_dual(diff, &sum0);
+    hadamard_4x4_dual(diff + 4, &sum1);
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    sum0 = vaddq_u16(sum0, sum1);
 
+    return vaddlvq_u16(sum0);
 }
 
-int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                       const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint8x8_t t0 = load_u8x4x2(pix1, 2 * stride_pix1);
-    uint8x8_t t1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
+    int16x8_t diff[16];
+    uint16x8_t out[4];
 
-    uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
-    uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4_dual(pix1 + 8, stride_pix1,  pix2 + 8, stride_pix2, diff + 8);
 
-    return _satd_4x4_neon(vreinterpretq_s16_u16(vsubl_u8(t0, r0)),
-                          vreinterpretq_s16_u16(vsubl_u8(r1, t1)));
-}
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
+#if X265_DEPTH == 10
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
-int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
-{
-    uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
+    sum0 = vaddq_u16(sum0, sum1);
 
-    i0 = vld1_u8(pix1 + 0 * stride_pix1);
-    i1 = vld1_u8(pix2 + 0 * stride_pix2);
-    i2 = vld1_u8(pix1 + 1 * stride_pix1);
-    i3 = vld1_u8(pix2 + 1 * stride_pix2);
-    i4 = vld1_u8(pix1 + 2 * stride_pix1);
-    i5 = vld1_u8(pix2 + 2 * stride_pix2);
-    i6 = vld1_u8(pix1 + 3 * stride_pix1);
-    i7 = vld1_u8(pix2 + 3 * stride_pix2);
+    return vaddlvq_u16(sum0);
+#else // X265_DEPTH == 12
+    uint32x4_t sum0 = vpaddlq_u16(out[0]);
+    uint32x4_t sum1 = vpaddlq_u16(out[1]);
+    sum0 = vpadalq_u16(sum0, out[2]);
+    sum1 = vpadalq_u16(sum1, out[3]);
 
-    int16x8_t v0 = vreinterpretq_s16_u16(vsubl_u8(i0, i1));
-    int16x8_t v1 = vreinterpretq_s16_u16(vsubl_u8(i2, i3));
-    int16x8_t v2 = vreinterpretq_s16_u16(vsubl_u8(i4, i5));
-    int16x8_t v3 = vreinterpretq_s16_u16(vsubl_u8(i6, i7));
+    sum0 = vaddq_u32(sum0, sum1);
 
-    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+    return vaddvq_u32(sum0);
+#endif // X265_DEPTH == 10
 }
 
-int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                        const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint16x8_t v30, v31;
-    int16x8_t v0, v1, v2, v3;
-    uint16x8_t t0, t1;
+    uint32x4_t sum[2]= { vdupq_n_u32(0), vdupq_n_u32(0) };
+    int16x8_t diff[8];
+    uint16x8_t out[2];
 
-    _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
-    v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    for (int i = 0; i < 4; ++i)
+    {
+        load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+        load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
 
-    _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
-    v30 = vaddq_u16(v30, t0);
-    v31 = vaddq_u16(v31, t1);
+        hadamard_4x4_quad(diff, out);
 
-    _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
-    v30 = vaddq_u16(v30, t0);
-    v31 = vaddq_u16(v31, t1);
+        sum[0] = vpadalq_u16(sum[0], out[0]);
+        sum[1] = vpadalq_u16(sum[1], out[1]);
 
-    _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
-    v30 = vaddq_u16(v30, t0);
-    v31 = vaddq_u16(v31, t1);
+        pix1 += 4 * stride_pix1;
+        pix2 += 4 * stride_pix2;
+    }
 
-    uint32x4_t sum0 = vpaddlq_u16(v30);
-    uint32x4_t sum1 = vpaddlq_u16(v31);
-    sum0 = vaddq_u32(sum0, sum1);
-    return vaddvq_u32(sum0);
+    return vaddvq_u32(vaddq_u32(sum[0], sum[1]));
 }
-#endif      //HIGH_BIT_DEPTH
 
-#if HIGH_BIT_DEPTH
-typedef uint32x4_t sa8d_out_type;
-#else
-typedef uint16x8_t sa8d_out_type;
-#endif
-
-static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
-                                      int16x8_t v3, int16x8_t v20,
-                                      int16x8_t v21, int16x8_t v22,
-                                      int16x8_t v23, sa8d_out_type &out0,
-                                      sa8d_out_type &out1)
+static inline int pixel_sa8d_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v16, v17, v18, v19;
-    int16x8_t v4, v5, v6, v7;
+    int16x8_t diff[8];
+    uint32x4_t res[2];
 
-    SUMSUB_AB(v16, v18, v0,  v2);
-    SUMSUB_AB(v17, v19, v1,  v3);
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
 
-    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
+    uint32x4_t s = vaddq_u32(res[0], res[1]);
 
-    SUMSUB_AB(v0,  v16, v16, v20);
-    SUMSUB_AB(v1,  v17, v17, v21);
-    SUMSUB_AB(v2,  v18, v18, v22);
-    SUMSUB_AB(v3,  v19, v19, v23);
-
-    transpose_8h_8h(v20, v21, v16, v17);
-    transpose_8h_8h(v4,  v5,  v0,  v1);
-    transpose_8h_8h(v22, v23, v18, v19);
-    transpose_8h_8h(v6,  v7,  v2,  v3);
+    return (vaddvq_u32(s) + 1) >> 1;
+}
 
-#if (X265_DEPTH <= 10)
+static inline int pixel_sa8d_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                        const uint16_t *pix2, intptr_t stride_pix2)
+{
+    uint32x4_t sum0, sum1;
 
-    int16x8_t v24, v25;
+    int16x8_t diff[8];
+    uint32x4_t res[2];
 
-    SUMSUB_AB(v2,  v3,  v20, v21);
-    SUMSUB_AB(v24, v25, v4,  v5);
-    SUMSUB_AB(v0,  v1,  v22, v23);
-    SUMSUB_AB(v4,  v5,  v6,  v7);
+    load_diff_u16x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vaddq_u32(res[0], res[1]);
 
-    transpose_4s_8h(v20, v22, v2,  v0);
-    transpose_4s_8h(v21, v23, v3,  v1);
-    transpose_4s_8h(v16, v18, v24, v4);
-    transpose_4s_8h(v17, v19, v25, v5);
+    load_diff_u16x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum1 = vaddq_u32(res[0], res[1]);
 
-    SUMSUB_AB(v0,  v2,  v20, v22);
-    SUMSUB_AB(v1,  v3,  v21, v23);
-    SUMSUB_AB(v4,  v6,  v16, v18);
-    SUMSUB_AB(v5,  v7,  v17, v19);
+    load_diff_u16x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+                      pix2 + 8 * stride_pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vaddq_u32(sum0, res[0]);
+    sum1 = vaddq_u32(sum1, res[1]);
 
-    transpose_2d_8h(v16, v20,  v0,  v4);
-    transpose_2d_8h(v17, v21,  v1,  v5);
-    transpose_2d_8h(v18, v22,  v2,  v6);
-    transpose_2d_8h(v19, v23,  v3,  v7);
+    load_diff_u16x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+                      pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vaddq_u32(sum0, res[0]);
+    sum1 = vaddq_u32(sum1, res[1]);
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v16));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v17));
-    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v18));
-    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v19));
-    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v20));
-    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v21));
-    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v22));
-    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v23));
+    sum0 = vaddq_u32(sum0, sum1);
 
-    uint16x8_t max0 = vmaxq_u16(abs0, abs4);
-    uint16x8_t max1 = vmaxq_u16(abs1, abs5);
-    uint16x8_t max2 = vmaxq_u16(abs2, abs6);
-    uint16x8_t max3 = vmaxq_u16(abs3, abs7);
+    return (vaddvq_u32(sum0) + 1) >> 1;
+}
 
-#if HIGH_BIT_DEPTH
-    out0 = vpaddlq_u16(max0);
-    out1 = vpaddlq_u16(max1);
-    out0 = vpadalq_u16(out0, max2);
-    out1 = vpadalq_u16(out1, max3);
-
-#else //HIGH_BIT_DEPTH
-
-    out0 = vaddq_u16(max0, max1);
-    out1 = vaddq_u16(max2, max3);
-
-#endif //HIGH_BIT_DEPTH
-
-#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32, each int16x8 is up-convreted to 2 int32x4 (low and high)
-
-    int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
-    int32x4_t v22l, v22h, v23l, v23h;
-    int32x4_t v4l, v4h, v5l, v5h;
-    int32x4_t v6l, v6h, v7l, v7h;
-    int32x4_t v16l, v16h, v17l, v17h;
-    int32x4_t v18l, v18h, v19l, v19h;
-    int32x4_t v20l, v20h, v21l, v21h;
-
-    ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
-    ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
-
-    v22l = vmovl_s16(vget_low_s16(v22));
-    v22h = vmovl_high_s16(v22);
-    v23l = vmovl_s16(vget_low_s16(v23));
-    v23h = vmovl_high_s16(v23);
-
-    ISUMSUB_AB(v0l,  v1l,  v22l, v23l);
-    ISUMSUB_AB(v0h,  v1h,  v22h, v23h);
-
-    v6l = vmovl_s16(vget_low_s16(v6));
-    v6h = vmovl_high_s16(v6);
-    v7l = vmovl_s16(vget_low_s16(v7));
-    v7h = vmovl_high_s16(v7);
-
-    ISUMSUB_AB(v4l,  v5l,  v6l,  v7l);
-    ISUMSUB_AB(v4h,  v5h,  v6h,  v7h);
-
-    transpose_2d_4s(v20l, v22l, v2l,  v0l);
-    transpose_2d_4s(v21l, v23l, v3l,  v1l);
-    transpose_2d_4s(v16l, v18l, v24l, v4l);
-    transpose_2d_4s(v17l, v19l, v25l, v5l);
-
-    transpose_2d_4s(v20h, v22h, v2h,  v0h);
-    transpose_2d_4s(v21h, v23h, v3h,  v1h);
-    transpose_2d_4s(v16h, v18h, v24h, v4h);
-    transpose_2d_4s(v17h, v19h, v25h, v5h);
-
-    ISUMSUB_AB(v0l,  v2l,  v20l, v22l);
-    ISUMSUB_AB(v1l,  v3l,  v21l, v23l);
-    ISUMSUB_AB(v4l,  v6l,  v16l, v18l);
-    ISUMSUB_AB(v5l,  v7l,  v17l, v19l);
-
-    ISUMSUB_AB(v0h,  v2h,  v20h, v22h);
-    ISUMSUB_AB(v1h,  v3h,  v21h, v23h);
-    ISUMSUB_AB(v4h,  v6h,  v16h, v18h);
-    ISUMSUB_AB(v5h,  v7h,  v17h, v19h);
-
-    v16l = v0l;
-    v16h = v4l;
-    v20l = v0h;
-    v20h = v4h;
-
-    v17l = v1l;
-    v17h = v5l;
-    v21l = v1h;
-    v21h = v5h;
-
-    v18l = v2l;
-    v18h = v6l;
-    v22l = v2h;
-    v22h = v6h;
-
-    v19l = v3l;
-    v19h = v7l;
-    v23l = v3h;
-    v23h = v7h;
-
-    uint32x4_t abs0_lo = vreinterpretq_u32_s32(vabsq_s32(v16l));
-    uint32x4_t abs1_lo = vreinterpretq_u32_s32(vabsq_s32(v17l));
-    uint32x4_t abs2_lo = vreinterpretq_u32_s32(vabsq_s32(v18l));
-    uint32x4_t abs3_lo = vreinterpretq_u32_s32(vabsq_s32(v19l));
-    uint32x4_t abs4_lo = vreinterpretq_u32_s32(vabsq_s32(v20l));
-    uint32x4_t abs5_lo = vreinterpretq_u32_s32(vabsq_s32(v21l));
-    uint32x4_t abs6_lo = vreinterpretq_u32_s32(vabsq_s32(v22l));
-    uint32x4_t abs7_lo = vreinterpretq_u32_s32(vabsq_s32(v23l));
-
-    uint32x4_t abs0_hi = vreinterpretq_u32_s32(vabsq_s32(v16h));
-    uint32x4_t abs1_hi = vreinterpretq_u32_s32(vabsq_s32(v17h));
-    uint32x4_t abs2_hi = vreinterpretq_u32_s32(vabsq_s32(v18h));
-    uint32x4_t abs3_hi = vreinterpretq_u32_s32(vabsq_s32(v19h));
-    uint32x4_t abs4_hi = vreinterpretq_u32_s32(vabsq_s32(v20h));
-    uint32x4_t abs5_hi = vreinterpretq_u32_s32(vabsq_s32(v21h));
-    uint32x4_t abs6_hi = vreinterpretq_u32_s32(vabsq_s32(v22h));
-    uint32x4_t abs7_hi = vreinterpretq_u32_s32(vabsq_s32(v23h));
-
-    uint32x4_t max0_lo = vmaxq_u32(abs0_lo, abs4_lo);
-    uint32x4_t max1_lo = vmaxq_u32(abs1_lo, abs5_lo);
-    uint32x4_t max2_lo = vmaxq_u32(abs2_lo, abs6_lo);
-    uint32x4_t max3_lo = vmaxq_u32(abs3_lo, abs7_lo);
-
-    uint32x4_t max0_hi = vmaxq_u32(abs0_hi, abs4_hi);
-    uint32x4_t max1_hi = vmaxq_u32(abs1_hi, abs5_hi);
-    uint32x4_t max2_hi = vmaxq_u32(abs2_hi, abs6_hi);
-    uint32x4_t max3_hi = vmaxq_u32(abs3_hi, abs7_hi);
-
-    uint32x4_t sum0 = vaddq_u32(max0_lo, max0_hi);
-    uint32x4_t sum1 = vaddq_u32(max1_lo, max1_hi);
-    uint32x4_t sum2 = vaddq_u32(max2_lo, max2_hi);
-    uint32x4_t sum3 = vaddq_u32(max3_lo, max3_hi);
-
-    out0 = vaddq_u32(sum0, sum1);
-    out1 = vaddq_u32(sum2, sum3);
+#else // !HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
+{
+    uint8x8_t s0 = load_u8x4x2(pix1, 2 * stride_pix1);
+    uint8x8_t s1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
 
+    uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
+    uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
 
-#endif
+    int16x8_t diff0 = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+    int16x8_t diff1 = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
 
+    return hadamard_4x4(diff0, diff1);
 }
 
-
-
-static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
-                                  int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_4x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
 {
+    int16x8_t diff[4];
+
+    uint8x8_t s0 = load_u8x4x2(pix1 + 0 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t s1 = load_u8x4x2(pix1 + 1 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t s2 = load_u8x4x2(pix1 + 2 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t s3 = load_u8x4x2(pix1 + 3 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t r0 = load_u8x4x2(pix2 + 0 * stride_pix2, 4 * stride_pix2);
+    uint8x8_t r1 = load_u8x4x2(pix2 + 1 * stride_pix2, 4 * stride_pix2);
+    uint8x8_t r2 = load_u8x4x2(pix2 + 2 * stride_pix2, 4 * stride_pix2);
+    uint8x8_t r3 = load_u8x4x2(pix2 + 3 * stride_pix2, 4 * stride_pix2);
+
+    diff[0] = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+    diff[1] = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
+    diff[2] = vreinterpretq_s16_u16(vsubl_u8(s2, r2));
+    diff[3] = vreinterpretq_s16_u16(vsubl_u8(r3, s3));
+
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
+
+    return vaddlvq_u16(out);
+}
 
-    int16x8_t v20, v21, v22, v23;
-    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+static inline int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[4];
 
-}
+    load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
 
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
 
+    return vaddlvq_u16(out);
+}
 
-int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v0, v1, v2, v3;
+    int16x8_t diff[8];
+    uint16x8_t out[2];
 
-    _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
-    uint16x8_t v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    uint16x8_t v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
 
-#if !(HIGH_BIT_DEPTH)
-    uint16x8_t sum = vaddq_u16(v30, v31);
-    return vaddvq_u32(vpaddlq_u16(sum));
-#else
-    uint32x4_t sum = vpaddlq_u16(v30);
-    sum = vpadalq_u16(sum, v31);
-    return vaddvq_u32(sum);
-#endif
-}
+    out[0] = vaddq_u16(out[0], out[1]);
 
+    return vaddlvq_u16(out[0]);
+}
 
-int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                       const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v0, v1, v2, v3;
-    int16x8_t v20, v21, v22, v23;
-    sa8d_out_type res0, res1;
+    int16x8_t diff[16];
+    uint16x8_t out[4];
 
-    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
-
-#if HIGH_BIT_DEPTH
-    uint32x4_t s = vaddq_u32(res0, res1);
-    return (vaddvq_u32(s) + 1) >> 1;
-#else
-    return (vaddlvq_u16(vaddq_u16(res0, res1)) + 1) >> 1;
-#endif
-}
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+                     pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
 
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
+    sum0 = vaddq_u16(sum0, sum1);
 
+    return vaddlvq_u16(sum0);
+}
 
-int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                       const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v0, v1, v2, v3;
-    int16x8_t v20, v21, v22, v23;
-    sa8d_out_type res0, res1;
-    uint32x4_t v30, v31;
+    int16x8_t diff[8];
 
-    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpaddlq_u16(res0);
-    v31 = vpaddlq_u16(res1);
-#else
-    v30 = vaddq_u32(res0, res1);
-#endif
+    uint16x8_t out[2];
+    hadamard_4x4_dual(diff, &out[0]);
+    hadamard_4x4_dual(diff + 4, &out[1]);
 
-    _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    out[0] = vaddq_u16(out[0], out[1]);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, res0);
-    v31 = vpadalq_u16(v31, res1);
-#else
-    v31 = vaddq_u32(res0, res1);
-#endif
+    return vaddlvq_u16(out[0]);
+}
 
+static inline int pixel_satd_16x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                       const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[16];
+    uint16x8_t out[4];
 
-    _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
-                 v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, res0);
-    v31 = vpadalq_u16(v31, res1);
-#else
-    v30 = vaddq_u32(v30, res0);
-    v31 = vaddq_u32(v31, res1);
-#endif
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
-    _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
-                 v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, res0);
-    v31 = vpadalq_u16(v31, res1);
-#else
-    v30 = vaddq_u32(v30, res0);
-    v31 = vaddq_u32(v31, res1);
-#endif
+    sum0 = vaddq_u16(sum0, sum1);
+
+    return vaddlvq_u16(sum0);
+}
+
+static inline int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                        const uint8_t *pix2, intptr_t stride_pix2)
+{
+    uint16x8_t sum[2], out[2];
+    int16x8_t diff[8];
+
+    load_diff_u8x16x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = out[0];
+    sum[1] = out[1];
+
+    load_diff_u8x16x4(pix1 + 4 * stride_pix1, stride_pix1,
+                      pix2 + 4 * stride_pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = vaddq_u16(sum[0], out[0]);
+    sum[1] = vaddq_u16(sum[1], out[1]);
+
+    load_diff_u8x16x4(pix1 + 8 * stride_pix1, stride_pix1,
+                      pix2 + 8 * stride_pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = vaddq_u16(sum[0], out[0]);
+    sum[1] = vaddq_u16(sum[1], out[1]);
+
+    load_diff_u8x16x4(pix1 + 12 * stride_pix1, stride_pix1,
+                      pix2 + 12 * stride_pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = vaddq_u16(sum[0], out[0]);
+    sum[1] = vaddq_u16(sum[1], out[1]);
+
+    uint32x4_t sum0 = vpaddlq_u16(sum[0]);
+    uint32x4_t sum1 = vpaddlq_u16(sum[1]);
 
-    v30 = vaddq_u32(v30, v31);
+    sum0 = vaddq_u32(sum0, sum1);
 
-    return (vaddvq_u32(v30) + 1) >> 1;
+    return vaddvq_u32(sum0);
 }
 
+static inline int pixel_sa8d_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
+    uint16x8_t res[2];
 
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
 
+    return (vaddlvq_u16(vaddq_u16(res[0], res[1])) + 1) >> 1;
+}
 
+static inline int pixel_sa8d_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                        const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
+    uint16x8_t res[2];
+    uint32x4_t sum0, sum1;
+
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpaddlq_u16(res[0]);
+    sum1 = vpaddlq_u16(res[1]);
+
+    load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpadalq_u16(sum0, res[0]);
+    sum1 = vpadalq_u16(sum1, res[1]);
+
+    load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+                     pix2 + 8 * stride_pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpadalq_u16(sum0, res[0]);
+    sum1 = vpadalq_u16(sum1, res[1]);
+
+    load_diff_u8x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+                     pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpadalq_u16(sum0, res[0]);
+    sum1 = vpadalq_u16(sum1, res[1]);
 
+    sum0 = vaddq_u32(sum0, sum1);
 
+    return (vaddvq_u32(sum0) + 1) >> 1;
+}
 
+#endif // HIGH_BIT_DEPTH
 
 template<int size>
 void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
@@ -1425,7 +1431,7 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
 
 template<int w, int h>
 // Calculate sa8d in blocks of 8x8
-int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d8_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
 {
     int cost = 0;
 
@@ -1440,7 +1446,7 @@ int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2
 
 template<int w, int h>
 // Calculate sa8d in blocks of 16x16
-int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d16_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
 {
     int cost = 0;
 
@@ -1474,42 +1480,63 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
 
 
 template<int w, int h>
-// calculate satd in blocks of 4x4
 int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
 {
     int satd = 0;
 
-    for (int row = 0; row < h; row += 4)
-        for (int col = 0; col < w; col += 4)
-            satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
-                                        pix2 + row * stride_pix2 + col, stride_pix2);
+    if (w == 4 && h == 4) {
+        satd = pixel_satd_4x4_neon(pix1, stride_pix1, pix2, stride_pix2);
+    } else {
+        for (int row = 0; row < h; row += 8)
+            for (int col = 0; col < w; col += 4)
+                satd += pixel_satd_4x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                            pix2 + row * stride_pix2 + col, stride_pix2);
+    }
 
     return satd;
 }
 
 template<int w, int h>
-// calculate satd in blocks of 8x4
 int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
 {
     int satd = 0;
 
-    if (((w | h) & 15) == 0)
+    if (w % 16 == 0 && h % 16 == 0)
     {
         for (int row = 0; row < h; row += 16)
             for (int col = 0; col < w; col += 16)
                 satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
                                               pix2 + row * stride_pix2 + col, stride_pix2);
-
     }
-    else if (((w | h) & 7) == 0)
+    else if (w % 8 == 0 && h % 16 == 0)
+    {
+        for (int row = 0; row < h; row += 16)
+            for (int col = 0; col < w; col += 8)
+                satd += pixel_satd_8x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                             pix2 + row * stride_pix2 + col, stride_pix2);
+    }
+    else if (w % 16 == 0 && h % 8 == 0)
+    {
+        for (int row = 0; row < h; row += 8)
+            for (int col = 0; col < w; col += 16)
+                satd += pixel_satd_16x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                             pix2 + row * stride_pix2 + col, stride_pix2);
+    }
+    else if (w % 16 == 0 && h % 4 == 0)
+    {
+        for (int row = 0; row < h; row += 4)
+            for (int col = 0; col < w; col += 16)
+                satd += pixel_satd_16x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                             pix2 + row * stride_pix2 + col, stride_pix2);
+    }
+    else if (w % 8 == 0 && h % 8 == 0)
     {
         for (int row = 0; row < h; row += 8)
             for (int col = 0; col < w; col += 8)
                 satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
                                             pix2 + row * stride_pix2 + col, stride_pix2);
-
     }
-    else
+    else // w multiple of 8, h multiple of 4
     {
         for (int row = 0; row < h; row += 4)
             for (int col = 0; col < w; col += 8)
@@ -1634,38 +1661,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     LUMA_PU(64, 16);
     LUMA_PU(16, 64);
 
-    p.pu[LUMA_4x4].satd   = pixel_satd_4x4_neon;
-    p.pu[LUMA_8x4].satd   = pixel_satd_8x4_neon;
-    
+    p.pu[LUMA_4x4].satd   = satd4_neon<4, 4>;
+    p.pu[LUMA_4x8].satd   = satd4_neon<4, 8>;
+    p.pu[LUMA_4x16].satd  = satd4_neon<4, 16>;
+    p.pu[LUMA_8x4].satd   = satd8_neon<8, 4>;
     p.pu[LUMA_8x8].satd   = satd8_neon<8, 8>;
-    p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
-    p.pu[LUMA_16x8].satd  = satd8_neon<16, 8>;
     p.pu[LUMA_8x16].satd  = satd8_neon<8, 16>;
-    p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+    p.pu[LUMA_8x32].satd  = satd8_neon<8, 32>;
+    p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
     p.pu[LUMA_16x4].satd  = satd8_neon<16, 4>;
-    p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
-    p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+    p.pu[LUMA_16x8].satd  = satd8_neon<16, 8>;
+    p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+    p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
     p.pu[LUMA_16x32].satd = satd8_neon<16, 32>;
-    p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+    p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
     p.pu[LUMA_24x32].satd = satd8_neon<24, 32>;
     p.pu[LUMA_32x8].satd  = satd8_neon<32, 8>;
-    p.pu[LUMA_8x32].satd  = satd8_neon<8, 32>;
-    p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
-    p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+    p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+    p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+    p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
     p.pu[LUMA_32x64].satd = satd8_neon<32, 64>;
-    p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
     p.pu[LUMA_48x64].satd = satd8_neon<48, 64>;
     p.pu[LUMA_64x16].satd = satd8_neon<64, 16>;
-    p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
-
-#if HIGH_BIT_DEPTH
-    p.pu[LUMA_4x8].satd   = satd4_neon<4, 8>;
-    p.pu[LUMA_4x16].satd  = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
-    p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+    p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+    p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
+    p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
 
 
     LUMA_CU(4, 4);
@@ -1673,7 +1693,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     LUMA_CU(16, 16);
     LUMA_CU(32, 32);
     LUMA_CU(64, 64);
-    
+
 #if !(HIGH_BIT_DEPTH)
     p.cu[BLOCK_8x8].var   = pixel_var_neon<8>;
     p.cu[BLOCK_16x16].var = pixel_var_neon<16>;
@@ -1697,17 +1717,17 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_8x8].calcresidual[ALIGNED]       = getResidual_neon<8>;
     p.cu[BLOCK_16x16].calcresidual[NONALIGNED]  = getResidual_neon<16>;
     p.cu[BLOCK_16x16].calcresidual[ALIGNED]     = getResidual_neon<16>;
-    
+
 #if defined(__APPLE__)
     p.cu[BLOCK_32x32].calcresidual[NONALIGNED]  = getResidual_neon<32>;
     p.cu[BLOCK_32x32].calcresidual[ALIGNED]     = getResidual_neon<32>;
 #endif // defined(__APPLE__)
 
-    p.cu[BLOCK_4x4].sa8d   = pixel_satd_4x4_neon;
-    p.cu[BLOCK_8x8].sa8d   = pixel_sa8d_8x8_neon;
-    p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
-    p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
-    p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
+    p.cu[BLOCK_4x4].sa8d   = satd4_neon<4, 4>;
+    p.cu[BLOCK_8x8].sa8d   = sa8d8_neon<8, 8>;
+    p.cu[BLOCK_16x16].sa8d = sa8d16_neon<16, 16>;
+    p.cu[BLOCK_32x32].sa8d = sa8d16_neon<32, 32>;
+    p.cu[BLOCK_64x64].sa8d = sa8d16_neon<64, 64>;
 
 
 #define CHROMA_PU_420(W, H) \
@@ -1743,38 +1763,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 
     p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = pixel_satd_4x4_neon;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd8_neon<8, 8>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd   = NULL;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = pixel_satd_8x4_neon;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd8_neon<16, 8>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd8_neon<8, 16>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd   = NULL;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4_neon<16, 12>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd4_neon<16, 4>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd8_neon<32, 8>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd8_neon<8, 32>;
-    
-#if HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = satd4_neon<4, 4>;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = satd4_neon<4, 8>;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = satd8_neon<8, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd8_neon<8, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd8_neon<8, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd8_neon<8, 32>;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd8_neon<16, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd8_neon<16, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd8_neon<16, 12>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd8_neon<32, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
 
 
 #define CHROMA_CU_420(W, H) \
@@ -1783,7 +1795,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>;  \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-    
+
 #define CHROMA_CU_S_420(W, H) \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
@@ -1799,9 +1811,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 
     p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
-    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
-    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
-    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8_neon<8, 8>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16_neon<16, 16>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16_neon<32, 32>;
 
 
 #define CHROMA_PU_422(W, H) \
@@ -1837,34 +1849,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 
     p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd   = NULL;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd8_neon<8, 16>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = pixel_satd_4x4_neon;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd   = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd  = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = satd4_neon<4, 4>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd4_neon<4, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd4_neon<4, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd4_neon<4, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd  = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd8_neon<8, 4>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = satd8_neon<8, 8>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd8_neon<8, 12>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd8_neon<8, 16>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = satd8_neon<8, 32>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd  = NULL;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd4_neon<8, 4>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd  = NULL;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd8_neon<16, 8>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
-    
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd4_neon<8, 12>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = satd8_neon<8, 64>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd8_neon<16, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8_neon<16, 24>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8_neon<24, 64>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8_neon<32, 48>;
-
-#if HIGH_BIT_DEPTH
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd4_neon<4, 8>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd4_neon<4, 16>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd4_neon<4, 32>;
-#endif // HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
 
 
 #define CHROMA_CU_422(W, H) \
@@ -1887,10 +1896,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     CHROMA_CU_422(16, 32)
     CHROMA_CU_422(32, 64)
 
-    p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
-    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
-    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
-    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d       = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
+    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d     = sa8d8_neon<8, 16>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d     = sa8d16_neon<16, 32>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d     = sa8d16_neon<32, 64>;
+
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d  = sa8d8_neon<8, 16>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = sa8d16_neon<16, 32>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = sa8d16_neon<32, 64>;
 
     p.weight_pp = weight_pp_neon;
 
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 106ba903a..856c12862 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -56,261 +56,3 @@ function PFX(pixel_sub_ps_8x16_sve)
     ret
 endfunc
 
-//******* satd *******
-.macro satd_4x4_sve
-    ld1b            {z0.h}, p0/z, [x0]
-    ld1b            {z2.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z1.h}, p0/z, [x0]
-    ld1b            {z3.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z4.h}, p0/z, [x0]
-    ld1b            {z6.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z5.h}, p0/z, [x0]
-    ld1b            {z7.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-
-    sub             z0.h, z0.h, z2.h
-    sub             z1.h, z1.h, z3.h
-    sub             z2.h, z4.h, z6.h
-    sub             z3.h, z5.h, z7.h
-
-    add             z4.h, z0.h, z2.h
-    add             z5.h, z1.h, z3.h
-    sub             z6.h, z0.h, z2.h
-    sub             z7.h, z1.h, z3.h
-
-    add             z0.h, z4.h, z5.h
-    sub             z1.h, z4.h, z5.h
-
-    add             z2.h, z6.h, z7.h
-    sub             z3.h, z6.h, z7.h
-
-    trn1            z4.h, z0.h, z2.h
-    trn2            z5.h, z0.h, z2.h
-
-    trn1            z6.h, z1.h, z3.h
-    trn2            z7.h, z1.h, z3.h
-
-    add             z0.h, z4.h, z5.h
-    sub             z1.h, z4.h, z5.h
-
-    add             z2.h, z6.h, z7.h
-    sub             z3.h, z6.h, z7.h
-
-    trn1            z4.s, z0.s, z1.s
-    trn2            z5.s, z0.s, z1.s
-
-    trn1            z6.s, z2.s, z3.s
-    trn2            z7.s, z2.s, z3.s
-
-    abs             z4.h, p0/m, z4.h
-    abs             z5.h, p0/m, z5.h
-    abs             z6.h, p0/m, z6.h
-    abs             z7.h, p0/m, z7.h
-
-    smax            z4.h, p0/m, z4.h, z5.h
-    smax            z6.h, p0/m, z6.h, z7.h
-
-    add             z0.h, z4.h, z6.h
-
-    uaddlp          v0.2s, v0.4h
-    uaddlp          v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_sve)
-    ptrue           p0.h, vl4
-    satd_4x4_sve
-    fmov            x0, d0
-    ret
-endfunc
-
-function PFX(pixel_satd_8x4_sve)
-    ptrue           p0.h, vl4
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_sve
-    add             x0, x4, #4
-    add             x2, x5, #4
-    umov            x6, v0.d[0]
-    satd_4x4_sve
-    umov            x0, v0.d[0]
-    add             x0, x0, x6
-    ret
-endfunc
-
-function PFX(pixel_satd_8x12_sve)
-    ptrue           p0.h, vl4
-    mov             x4, x0
-    mov             x5, x2
-    mov             x7, #0
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.rept 2
-    sub             x0, x0, #4
-    sub             x2, x2, #4
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.endr
-    mov             x0, x7
-    ret
-endfunc
-
-.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
-    mov             x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
-    ld1b            {z0.h}, p0/z, [x0]
-    ld1b            {z1.h}, p0/z, [x0, x11]
-    ld1b            {z2.h}, p0/z, [x2]
-    ld1b            {z3.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z4.h}, p0/z, [x0]
-    ld1b            {z5.h}, p0/z, [x0, x11]
-    ld1b            {z6.h}, p0/z, [x2]
-    ld1b            {z7.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    sub             \v0\().h, z0.h, z2.h
-    sub             \v4\().h, z1.h, z3.h
-    sub             \v1\().h, z4.h, z6.h
-    sub             \v5\().h, z5.h, z7.h
-
-    ld1b            {z0.h}, p0/z, [x0]
-    ld1b            {z1.h}, p0/z, [x0, x11]
-    ld1b            {z2.h}, p0/z, [x2]
-    ld1b            {z3.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z4.h}, p0/z, [x0]
-    ld1b            {z5.h}, p0/z, [x0, x11]
-    ld1b            {z6.h}, p0/z, [x2]
-    ld1b            {z7.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    sub             \v2\().h, z0.h, z2.h
-    sub             \v6\().h, z1.h, z3.h
-    sub             \v3\().h, z4.h, z6.h
-    sub             \v7\().h, z5.h, z7.h
-.endm
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_sve), export=0
-    HADAMARD4_V     z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
-    HADAMARD4_V     z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
-    trn4            z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
-    trn4            z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
-    SUMSUB_ABCD     z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
-    SUMSUB_ABCD     z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
-    trn4            z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
-    trn4            z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
-    ABS8_SVE        z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
-    smax            z0.h, p0/m, z0.h, z2.h
-    smax            z1.h, p0/m, z1.h, z3.h
-    smax            z4.h, p0/m, z4.h, z6.h
-    smax            z5.h, p0/m, z5.h, z7.h
-    ret
-endfunc
-
-function PFX(satd_16x4_sve), export=0
-    LOAD_DIFF_16x4_sve  z16, z17, z18, z19, z20, z21, z22, z23
-    b                    PFX(satd_8x4v_8x8h_sve)
-endfunc
-
-.macro pixel_satd_32x8_sve
-    mov             x4, x0
-    mov             x5, x2
-.rept 2
-    bl              PFX(satd_16x4_sve)
-    add             z30.h, z30.h, z0.h
-    add             z31.h, z31.h, z1.h
-    add             z30.h, z30.h, z4.h
-    add             z31.h, z31.h, z5.h
-.endr
-    add             x0, x4, #16
-    add             x2, x5, #16
-.rept 2
-    bl              PFX(satd_16x4_sve)
-    add             z30.h, z30.h, z0.h
-    add             z31.h, z31.h, z1.h
-    add             z30.h, z30.h, z4.h
-    add             z31.h, z31.h, z5.h
-.endr
-.endm
-
-.macro satd_32x16_sve
-    movi            v30.2d, #0
-    movi            v31.2d, #0
-    pixel_satd_32x8_sve
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    pixel_satd_32x8_sve
-    add             z0.h, z30.h, z31.h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-.endm
-
-function PFX(pixel_satd_32x16_sve)
-    ptrue           p0.h, vl8
-    mov             x10, x30
-    satd_32x16_sve
-    mov             x0, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x32_sve)
-    ptrue           p0.h, vl8
-    mov             x10, x30
-    mov             x7, #0
-    satd_32x16_sve
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-    satd_32x16_sve
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-.macro satd_64x16_sve
-    mov             x8, x0
-    mov             x9, x2
-    satd_32x16_sve
-    add             x7, x7, x6
-    add             x0, x8, #32
-    add             x2, x9, #32
-    satd_32x16_sve
-    add             x7, x7, x6
-.endm
-
-function PFX(pixel_satd_64x48_sve)
-    ptrue           p0.h, vl8
-    mov             x10, x30
-    mov             x7, #0
-.rept 2
-    satd_64x16_sve
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-.endr
-    satd_64x16_sve
-    mov             x0, x7
-    ret             x10
-endfunc
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 26fdbac6c..e189fdcd7 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -565,963 +565,6 @@ function PFX(scale2D_64to32_neon)
     ret
 endfunc
 
-//******* satd *******
-.macro satd_4x4_neon
-    ldr             s0, [x0]
-    ldr             s1, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1             {v0.s}[1], [x0], x1
-    ld1             {v1.s}[1], [x2], x3
-
-    ldr             s2, [x0]
-    ldr             s3, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1             {v2.s}[1], [x0], x1
-    ld1             {v3.s}[1], [x2], x3
-
-    usubl           v4.8h, v0.8b, v1.8b
-    usubl           v5.8h, v2.8b, v3.8b
-
-    add             v6.8h, v4.8h, v5.8h
-    sub             v7.8h, v4.8h, v5.8h
-
-    mov             v4.d[0], v6.d[1]
-    add             v0.4h, v6.4h, v4.4h
-    sub             v2.4h, v6.4h, v4.4h
-
-    mov             v5.d[0], v7.d[1]
-    add             v1.4h, v7.4h, v5.4h
-    sub             v3.4h, v7.4h, v5.4h
-
-    trn1            v4.4h, v0.4h, v1.4h
-    trn2            v5.4h, v0.4h, v1.4h
-
-    trn1            v6.4h, v2.4h, v3.4h
-    trn2            v7.4h, v2.4h, v3.4h
-
-    add             v0.4h, v4.4h, v5.4h
-    sub             v1.4h, v4.4h, v5.4h
-
-    add             v2.4h, v6.4h, v7.4h
-    sub             v3.4h, v6.4h, v7.4h
-
-    trn1            v4.2s, v0.2s, v1.2s
-    trn2            v5.2s, v0.2s, v1.2s
-
-    trn1            v6.2s, v2.2s, v3.2s
-    trn2            v7.2s, v2.2s, v3.2s
-
-    abs             v4.4h, v4.4h
-    abs             v5.4h, v5.4h
-    abs             v6.4h, v6.4h
-    abs             v7.4h, v7.4h
-
-    smax            v1.4h, v4.4h, v5.4h
-    smax            v2.4h, v6.4h, v7.4h
-
-    add             v0.4h, v1.4h, v2.4h
-    uaddlp          v0.2s, v0.4h
-    uaddlp          v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_neon)
-    satd_4x4_neon
-    fmov            x0, d0
-    ret
-endfunc
-
-.macro x265_satd_4x8_8x4_end_neon
-    add             v0.8h, v4.8h, v6.8h
-    add             v1.8h, v5.8h, v7.8h
-    sub             v2.8h, v4.8h, v6.8h
-    sub             v3.8h, v5.8h, v7.8h
-
-    trn1            v16.8h, v0.8h, v1.8h
-    trn2            v17.8h, v0.8h, v1.8h
-    add             v4.8h, v16.8h, v17.8h
-    trn1            v18.8h, v2.8h, v3.8h
-    trn2            v19.8h, v2.8h, v3.8h
-    sub             v5.8h, v16.8h, v17.8h
-    add             v6.8h, v18.8h, v19.8h
-    sub             v7.8h, v18.8h, v19.8h
-    trn1            v0.4s, v4.4s, v6.4s
-    trn2            v2.4s, v4.4s, v6.4s
-    abs             v0.8h, v0.8h
-    trn1            v1.4s, v5.4s, v7.4s
-    trn2            v3.4s, v5.4s, v7.4s
-    abs             v2.8h, v2.8h
-    abs             v1.8h, v1.8h
-    abs             v3.8h, v3.8h
-    umax            v0.8h, v0.8h, v2.8h
-    umax            v1.8h, v1.8h, v3.8h
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-.endm
-
-.macro pixel_satd_4x8_neon
-    ld1r            {v1.2s}, [x2], x3
-    ld1r            {v0.2s}, [x0], x1
-    ld1r            {v3.2s}, [x2], x3
-    ld1r            {v2.2s}, [x0], x1
-    ld1r            {v5.2s}, [x2], x3
-    ld1r            {v4.2s}, [x0], x1
-    ld1r            {v7.2s}, [x2], x3
-    ld1r            {v6.2s}, [x0], x1
-
-    ld1             {v1.s}[1], [x2], x3
-    ld1             {v0.s}[1], [x0], x1
-    usubl           v0.8h, v0.8b, v1.8b
-    ld1             {v3.s}[1], [x2], x3
-    ld1             {v2.s}[1], [x0], x1
-    usubl           v1.8h, v2.8b, v3.8b
-    ld1             {v5.s}[1], [x2], x3
-    ld1             {v4.s}[1], [x0], x1
-    usubl           v2.8h, v4.8b, v5.8b
-    ld1             {v7.s}[1], [x2], x3
-    add             v4.8h, v0.8h, v1.8h
-    sub             v5.8h, v0.8h, v1.8h
-    ld1             {v6.s}[1], [x0], x1
-    usubl           v3.8h, v6.8b, v7.8b
-    add             v6.8h, v2.8h, v3.8h
-    sub             v7.8h, v2.8h, v3.8h
-    x265_satd_4x8_8x4_end_neon
-.endm
-
-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x8_neon)
-    pixel_satd_4x8_neon
-    mov             w0, v0.s[0]
-    ret
-endfunc
-
-function PFX(pixel_satd_4x16_neon)
-    mov             w4, #0
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w4, w4, w5
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w0, w5, w4
-    ret
-endfunc
-
-function PFX(pixel_satd_4x32_neon)
-    mov             w4, #0
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w4, w4, w5
-.endr
-    mov             w0, w4
-    ret
-endfunc
-
-function PFX(pixel_satd_12x16_neon)
-    mov             x4, x0
-    mov             x5, x2
-    mov             w7, #0
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-
-    add             x0, x4, #4
-    add             x2, x5, #4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-
-    add             x0, x4, #8
-    add             x2, x5, #8
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w0, w7, w6
-    ret
-endfunc
-
-function PFX(pixel_satd_12x32_neon)
-    mov             x4, x0
-    mov             x5, x2
-    mov             w7, #0
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-.endr
-
-    add             x0, x4, #4
-    add             x2, x5, #4
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-.endr
-
-    add             x0, x4, #8
-    add             x2, x5, #8
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-.endr
-
-    mov             w0, w7
-    ret
-endfunc
-
-function PFX(pixel_satd_8x4_neon)
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_neon
-    add             x0, x4, #4
-    add             x2, x5, #4
-    umov            x6, v0.d[0]
-    satd_4x4_neon
-    umov            x0, v0.d[0]
-    add             x0, x0, x6
-    ret
-endfunc
-
-.macro LOAD_DIFF_8x4 v0 v1 v2 v3
-    ld1             {v0.8b}, [x0], x1
-    ld1             {v1.8b}, [x2], x3
-    ld1             {v2.8b}, [x0], x1
-    ld1             {v3.8b}, [x2], x3
-    ld1             {v4.8b}, [x0], x1
-    ld1             {v5.8b}, [x2], x3
-    ld1             {v6.8b}, [x0], x1
-    ld1             {v7.8b}, [x2], x3
-    usubl           \v0, v0.8b, v1.8b
-    usubl           \v1, v2.8b, v3.8b
-    usubl           \v2, v4.8b, v5.8b
-    usubl           \v3, v6.8b, v7.8b
-.endm
-
-.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
-    ld1             {v0.16b}, [x0], x1
-    ld1             {v1.16b}, [x2], x3
-    ld1             {v2.16b}, [x0], x1
-    ld1             {v3.16b}, [x2], x3
-    ld1             {v4.16b}, [x0], x1
-    ld1             {v5.16b}, [x2], x3
-    ld1             {v6.16b}, [x0], x1
-    ld1             {v7.16b}, [x2], x3
-    usubl           \v0, v0.8b, v1.8b
-    usubl           \v1, v2.8b, v3.8b
-    usubl           \v2, v4.8b, v5.8b
-    usubl           \v3, v6.8b, v7.8b
-    usubl2          \v4, v0.16b, v1.16b
-    usubl2          \v5, v2.16b, v3.16b
-    usubl2          \v6, v4.16b, v5.16b
-    usubl2          \v7, v6.16b, v7.16b
-.endm
-
-function PFX(satd_16x4_neon), export=0
-    LOAD_DIFF_16x4  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    b               PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-function PFX(satd_8x8_neon), export=0
-    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
-    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
-    b               PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_neon), export=0
-    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
-    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    trn4            v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
-    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    SUMSUB_ABCD     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    SUMSUB_ABCD     v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
-    trn4            v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
-    trn4            v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
-    ABS8            v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
-    smax            v0.8h, v0.8h, v2.8h
-    smax            v1.8h, v1.8h, v3.8h
-    smax            v2.8h, v4.8h, v6.8h
-    smax            v3.8h, v5.8h, v7.8h
-    ret
-endfunc
-
-function PFX(pixel_satd_8x8_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    add             v1.8h, v2.8h, v3.8h
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_8x12_neon)
-    mov             x4, x0
-    mov             x5, x2
-    mov             x7, #0
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.rept 2
-    sub             x0, x0, #4
-    sub             x2, x2, #4
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.endr
-    mov             x0, x7
-    ret
-endfunc
-
-function PFX(pixel_satd_8x16_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_8x32_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 3
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_8x64_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 7
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x4_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x8_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x12_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 2
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x16_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 3
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x24_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 5
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-.macro pixel_satd_16x32_neon
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 7
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-function PFX(pixel_satd_16x32_neon)
-    mov             x10, x30
-    pixel_satd_16x32_neon
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x64_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 15
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_24x32_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x4, x0
-    mov             x5, x2
-.rept 3
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-.rept 4
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-    add             x4, x4, #8
-    add             x5, x5, #8
-    mov             x0, x4
-    mov             x2, x5
-.endr
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_24x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x4, x0
-    mov             x5, x2
-.rept 3
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-.rept 4
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-    add             x4, x4, #8
-    add             x5, x5, #8
-    mov             x0, x4
-    mov             x2, x5
-.endr
-    sub             x4, x4, #24
-    sub             x5, x5, #24
-    add             x0, x4, x1, lsl #5
-    add             x2, x5, x3, lsl #5
-    mov             x4, x0
-    mov             x5, x2
-.rept 3
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-.rept 4
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-    add             x4, x4, #8
-    add             x5, x5, #8
-    mov             x0, x4
-    mov             x2, x5
-.endr
-    mov             x0, x7
-    ret             x10
-endfunc
-
-.macro pixel_satd_32x8
-    mov             x4, x0
-    mov             x5, x2
-.rept 2
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             x0, x4, #16
-    add             x2, x5, #16
-.rept 2
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-.macro satd_32x16_neon
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    pixel_satd_32x8
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    pixel_satd_32x8
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-.endm
-
-.macro satd_64x16_neon
-    mov             x8, x0
-    mov             x9, x2
-    satd_32x16_neon
-    add             x7, x7, x6
-    add             x0, x8, #32
-    add             x2, x9, #32
-    satd_32x16_neon
-    add             x7, x7, x6
-.endm
-
-function PFX(pixel_satd_32x8_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x4, x0
-    mov             x5, x2
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    pixel_satd_32x8
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x16_neon)
-    mov             x10, x30
-    satd_32x16_neon
-    mov             x0, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x24_neon)
-    mov             x10, x30
-    satd_32x16_neon
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    pixel_satd_32x8
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    add             x0, x0, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x32_neon)
-    mov             x10, x30
-    mov             x7, #0
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-    satd_32x16_neon
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x48_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 2
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-.endr
-    satd_32x16_neon
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 3
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-.endr
-    satd_32x16_neon
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x16_neon)
-    mov             x10, x30
-    mov             x7, #0
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x32_neon)
-    mov             x10, x30
-    mov             x7, #0
-    satd_64x16_neon
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x48_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 2
-    satd_64x16_neon
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-.endr
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 3
-    satd_64x16_neon
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-.endr
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_48x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x8, x0
-    mov             x9, x2
-.rept 3
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-.endr
-    satd_32x16_neon
-    add             x7, x7, x6
-
-    add             x0, x8, #32
-    add             x2, x9, #32
-    pixel_satd_16x32_neon
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    pixel_satd_16x32_neon
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(sa8d_8x8_neon), export=0
-    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
-    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
-    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
-    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    SUMSUB_ABCD     v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
-    SUMSUB_ABCD     v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
-    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    trn4            v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
-    SUMSUB_ABCD     v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
-    SUMSUB_ABCD     v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
-    trn4            v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
-    trn4            v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
-    SUMSUB_ABCD     v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
-    SUMSUB_ABCD     v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
-    trn4            v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
-    trn4            v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
-    ABS8            v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    smax            v16.8h, v16.8h, v20.8h
-    smax            v17.8h, v17.8h, v21.8h
-    smax            v18.8h, v18.8h, v22.8h
-    smax            v19.8h, v19.8h, v23.8h
-    add             v0.8h, v16.8h, v17.8h
-    add             v1.8h, v18.8h, v19.8h
-    ret
-endfunc
-
-function PFX(pixel_sa8d_8x8_neon)
-    mov             x10, x30
-    bl              PFX(sa8d_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    add             w0, w0, #1
-    lsr             w0, w0, #1
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_8x16_neon)
-    mov             x10, x30
-    bl              PFX(sa8d_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w5, v0.s[0]
-    add             w5, w5, #1
-    lsr             w5, w5, #1
-    bl              PFX(sa8d_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w4, v0.s[0]
-    add             w4, w4, #1
-    lsr             w4, w4, #1
-    add             w0, w4, w5
-    ret             x10
-endfunc
-
-.macro sa8d_16x16 reg
-    bl              PFX(sa8d_8x8_neon)
-    uaddlp          v30.4s, v0.8h
-    uaddlp          v31.4s, v1.8h
-    bl              PFX(sa8d_8x8_neon)
-    uadalp          v30.4s, v0.8h
-    uadalp          v31.4s, v1.8h
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    bl              PFX(sa8d_8x8_neon)
-    uadalp          v30.4s, v0.8h
-    uadalp          v31.4s, v1.8h
-    bl              PFX(sa8d_8x8_neon)
-    uadalp          v30.4s, v0.8h
-    uadalp          v31.4s, v1.8h
-    add             v0.4s, v30.4s, v31.4s
-    addv            s0, v0.4s
-    mov             \reg, v0.s[0]
-    add             \reg, \reg, #1
-    lsr             \reg, \reg, #1
-.endm
-
-function PFX(pixel_sa8d_16x16_neon)
-    mov             x10, x30
-    sa8d_16x16      w0
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_16x32_neon)
-    mov             x10, x30
-    sa8d_16x16      w4
-    sub             x0, x0, #8
-    sub             x2, x2, #8
-    sa8d_16x16      w5
-    add             w0, w4, w5
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_32x32_neon)
-    mov             x10, x30
-    sa8d_16x16      w4
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w5
-    sub             x0, x0, #24
-    sub             x2, x2, #24
-    sa8d_16x16      w6
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w7
-    add             w4, w4, w5
-    add             w6, w6, w7
-    add             w0, w4, w6
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_32x64_neon)
-    mov             x10, x30
-    mov             w11, #4
-    mov             w9, #0
-.Loop_sa8d_32:
-    sub             w11, w11, #1
-    sa8d_16x16      w4
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w5
-    add             w4, w4, w5
-    add             w9, w9, w4
-    sub             x0, x0, #24
-    sub             x2, x2, #24
-    cbnz            w11, .Loop_sa8d_32
-    mov             w0, w9
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_64x64_neon)
-    mov             x10, x30
-    mov             w11, #4
-    mov             w9, #0
-.Loop_sa8d_64:
-    sub             w11, w11, #1
-    sa8d_16x16      w4
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w5
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w6
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w7
-    add             w4, w4, w5
-    add             w6, w6, w7
-    add             w8, w4, w6
-    add             w9, w9, w8
-
-    sub             x0, x0, #56
-    sub             x2, x2, #56
-    cbnz            w11, .Loop_sa8d_64
-    mov             w0, w9
-    ret             x10
-endfunc
-
 /***** dequant_scaling*****/
 // void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
 function PFX(dequant_scaling_neon)
-- 
2.39.5 (Apple Git-154)

-------------- next part --------------
From a819aca07e207879224481058d8eb7a41a07b300 Mon Sep 17 00:00:00 2001
Message-Id: <a819aca07e207879224481058d8eb7a41a07b300.1746034801.git.li.zhang2 at arm.com>
In-Reply-To: <cover.1746034801.git.li.zhang2 at arm.com>
References: <cover.1746034801.git.li.zhang2 at arm.com>
From: Li Zhang <li.zhang2 at arm.com>
Date: Wed, 30 Apr 2025 19:29:01 +0200
Subject: [PATCH 1/3] AArch64: Clean up satd/sa8d functions

Clean up and optimize the Neon intrinsics implementation of the
satd/sa8d primitives for all bitdepths.

Remove the Neon and SVE assembly implementations of these primitives
since they are now slower than the Neon intrinsics implementations.
---
 source/common/aarch64/asm-primitives.cpp |   76 --
 source/common/aarch64/mem-neon.h         |    6 +-
 source/common/aarch64/pixel-prim.cpp     | 1407 +++++++++++-----------
 source/common/aarch64/pixel-util-sve.S   |  258 ----
 source/common/aarch64/pixel-util.S       |  957 ---------------
 5 files changed, 712 insertions(+), 1992 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 6097f7655..4d2c575d1 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -652,64 +652,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
     p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
 
-    // satd
-    ALL_LUMA_PU(satd, pixel_satd, neon);
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = PFX(pixel_satd_4x4_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = PFX(pixel_satd_8x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = PFX(pixel_satd_8x4_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = PFX(pixel_satd_4x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = PFX(pixel_satd_16x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = PFX(pixel_satd_8x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = PFX(pixel_satd_16x4_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = PFX(pixel_satd_4x16_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = PFX(pixel_satd_32x8_neon);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = PFX(pixel_satd_8x32_neon);
-
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = PFX(pixel_satd_4x8_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = PFX(pixel_satd_8x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = PFX(pixel_satd_4x4_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = PFX(pixel_satd_8x8_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = PFX(pixel_satd_4x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = PFX(pixel_satd_8x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = PFX(pixel_satd_8x12_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = PFX(pixel_satd_8x4_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = PFX(pixel_satd_16x8_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = PFX(pixel_satd_4x32_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = PFX(pixel_satd_8x64_neon);
-
-    // sa8d
-    p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_neon);
-    p.cu[BLOCK_8x8].sa8d   = PFX(pixel_sa8d_8x8_neon);
-    p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
-    p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
-    p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-
     // dequant_scaling
     p.dequant_scaling = PFX(dequant_scaling_neon);
 
@@ -857,24 +799,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
 
 #if !HIGH_BIT_DEPTH
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps  = PFX(pixel_sub_ps_8x16_sve);
-
-    // satd
-    p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_sve);
-    p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sve);
-    p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_sve);
-    p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sve);
-    p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sve);
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = PFX(pixel_satd_4x4_sve);
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = PFX(pixel_satd_8x4_sve);
-
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = PFX(pixel_satd_4x4_sve);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = PFX(pixel_satd_8x12_sve);
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = PFX(pixel_satd_8x4_sve);
-
-    // sa8d
-    p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_sve);
-    p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
 #else // HIGH_BIT_DEPTH
     // sse_pp
     p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_sve);
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 263c1d569..8bd5fbee9 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -106,8 +106,7 @@ static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
 {
     for (int i = 0; i < N; ++i)
     {
-        dst[i] = vld1q_u8(src);
-        src += stride;
+        dst[i] = vld1q_u8(src + i * stride);
     }
 }
 
@@ -230,8 +229,7 @@ static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
 {
     for (int i = 0; i < N; ++i)
     {
-        dst[i] = vld1q_u16(src);
-        src += stride;
+        dst[i] = vld1q_u16(src + i * stride);
     }
 }
 
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 15ccdff22..67c388b59 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -19,799 +19,805 @@ namespace
 {
 
 
-/* SATD SA8D variants - based on x264 */
-static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
+static inline void sumsubq_s16(int16x8_t *sum, int16x8_t *sub, const int16x8_t a, const int16x8_t b)
 {
-    sum = vaddq_s16(a, b);
-    sub = vsubq_s16(a, b);
+    *sum = vaddq_s16(a, b);
+    *sub = vsubq_s16(a, b);
 }
 
-static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
-                                   const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s16x2(int16x8_t *t1, int16x8_t *t2,
+                                       const int16x8_t s1, const int16x8_t s2)
 {
-    t1 = vtrn1q_s16(s1, s2);
-    t2 = vtrn2q_s16(s1, s2);
+    *t1 = vtrn1q_s16(s1, s2);
+    *t2 = vtrn2q_s16(s1, s2);
 }
 
-static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
-                                   const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s32x2(int16x8_t *t1, int16x8_t *t2,
+                                       const int16x8_t s1, const int16x8_t s2)
 {
     int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
     int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
 
-    t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
-    t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
+    *t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
+    *t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
 }
 
-static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
-                                   const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s64x2(int16x8_t *t1, int16x8_t *t2,
+                                       const int16x8_t s1, const int16x8_t s2)
 {
     int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
     int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
 
-    t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
-    t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
+    *t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
+    *t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
 }
 
-static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
-                               int16x8_t a, int16x8_t  b, int16x8_t  c, int16x8_t  d)
+static inline uint16x8_t max_abs_s16(const int16x8_t a, const int16x8_t b)
 {
-    SUMSUB_AB(s1, d1, a, b);
-    SUMSUB_AB(s2, d2, c, d);
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(a));
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(b));
+
+    return vmaxq_u16(abs0, abs1);
 }
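
As context for the helper above (an illustrative aside, not part of the patch): max_abs_s16 and max_abs_s32 fold the last butterfly stage of the transform into the absolute-value reduction, using the identity |a + b| + |a - b| == 2 * max(|a|, |b|). Summing the per-lane maxima therefore yields half of the full sum of absolute transformed differences, which is why the satd kernels below can return the reduction directly and the sa8d kernels only need one more rounded halving. A minimal scalar check of the identity:

    #include <stdlib.h>

    /* |a + b| + |a - b| == 2 * max(|a|, |b|) for any integers a, b. */
    static int abs_butterfly_equals_twice_max(int a, int b)
    {
        int lhs = abs(a + b) + abs(a - b);
        int max = abs(a) > abs(b) ? abs(a) : abs(b);
        return lhs == 2 * max;   /* always true */
    }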
 
-static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
-                               int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
+#if X265_DEPTH == 12
+static inline void sumsubq_s32(int32x4_t *sum, int32x4_t *sub, const int32x4_t a, const int32x4_t b)
 {
-    SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
-    SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
+    *sum = vaddq_s32(a, b);
+    *sub = vsubq_s32(a, b);
 }
 
-
-static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
-
+static inline void sumsublq_s16(int32x4_t *sum_lo, int32x4_t *sum_hi,
+                                int32x4_t *sub_lo, int32x4_t *sub_hi,
+                                const int16x8_t a, const int16x8_t b)
 {
+    *sum_lo = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+    *sub_lo = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+    *sum_hi = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+    *sub_hi = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+}
 
-    int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
-
-
-    SUMSUB_AB(v16, v17, v0,  v1);
-    SUMSUB_AB(v18, v19, v2,  v3);
-
-    SUMSUB_AB(v4 , v6 , v16, v18);
-    SUMSUB_AB(v5 , v7 , v17, v19);
-
-    transpose_8h_8h(v0, v1, v4, v5);
-    transpose_8h_8h(v2, v3, v6, v7);
+static inline void transpose_inplace_s32_s64x2(int32x4_t *t1, int32x4_t *t2)
+{
+    int64x2_t tmp1 = vreinterpretq_s64_s32(*t1);
+    int64x2_t tmp2 = vreinterpretq_s64_s32(*t2);
 
-    SUMSUB_AB(v16, v17, v0,  v1);
-    SUMSUB_AB(v18, v19, v2,  v3);
+    *t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
+    *t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+}
 
-    transpose_4s_8h(v0, v1, v16, v18);
-    transpose_4s_8h(v2, v3, v17, v19);
+static inline uint32x4_t max_abs_s32(int32x4_t a, int32x4_t b)
+{
+    uint32x4_t abs0 = vreinterpretq_u32_s32(vabsq_s32(a));
+    uint32x4_t abs1 = vreinterpretq_u32_s32(vabsq_s32(b));
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
-    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
-    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+    return vmaxq_u32(abs0, abs1);
+}
 
-    uint16x8_t max0 = vmaxq_u16(abs0, abs1);
-    uint16x8_t max1 = vmaxq_u16(abs2, abs3);
+#endif // X265_DEPTH == 12
 
-    uint16x8_t sum = vaddq_u16(max0, max1);
-    return vaddlvq_u16(sum);
+#if HIGH_BIT_DEPTH
+static inline void load_diff_u16x8x4(const uint16_t *pix1, intptr_t stride_pix1,
+                                     const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+    uint16x8_t r[4], t[4];
+    load_u16x8xn<4>(pix1, stride_pix1, r);
+    load_u16x8xn<4>(pix2, stride_pix2, t);
+
+    diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+    diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+    diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+    diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
 }
 
-static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
+static inline void load_diff_u16x8x4_dual(const uint16_t *pix1, intptr_t stride_pix1,
+                                          const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
 {
-    int16x8_t v2, v3;
-    SUMSUB_AB(v2,  v3,  v0,  v1);
-
-    transpose_2d_8h(v0, v1, v2, v3);
-    SUMSUB_AB(v2,  v3,  v0,  v1);
-
-    transpose_8h_8h(v0, v1, v2, v3);
-    SUMSUB_AB(v2,  v3,  v0,  v1);
+    load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+                      pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
+}
 
-    transpose_4s_8h(v0, v1, v2, v3);
+static inline void load_diff_u16x8x8(const uint16_t *pix1, intptr_t stride_pix1,
+                                     const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+    uint16x8_t r[8], t[8];
+    load_u16x8xn<8>(pix1, stride_pix1, r);
+    load_u16x8xn<8>(pix2, stride_pix2, t);
+
+    diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+    diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+    diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+    diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
+    diff[4] = vreinterpretq_s16_u16(vsubq_u16(r[4], t[4]));
+    diff[5] = vreinterpretq_s16_u16(vsubq_u16(r[5], t[5]));
+    diff[6] = vreinterpretq_s16_u16(vsubq_u16(r[6], t[6]));
+    diff[7] = vreinterpretq_s16_u16(vsubq_u16(r[7], t[7]));
+}
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
-    uint16x8_t max = vmaxq_u16(abs0, abs1);
+#else // !HIGH_BIT_DEPTH
+static inline void load_diff_u8x8x4(const uint8_t *pix1, intptr_t stride_pix1,
+                                    const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+    uint8x8_t r[4], t[4];
+    load_u8x8xn<4>(pix1, stride_pix1, r);
+    load_u8x8xn<4>(pix2, stride_pix2, t);
+
+    diff[0] = vreinterpretq_s16_u16(vsubl_u8(r[0], t[0]));
+    diff[1] = vreinterpretq_s16_u16(vsubl_u8(r[1], t[1]));
+    diff[2] = vreinterpretq_s16_u16(vsubl_u8(r[2], t[2]));
+    diff[3] = vreinterpretq_s16_u16(vsubl_u8(r[3], t[3]));
+}
 
-    return vaddlvq_u16(max);
+static inline void load_diff_u8x8x8(const uint8_t *pix1, intptr_t stride_pix1,
+                                    const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+    load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+                     pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
 }
 
-static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
-                                 int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline void load_diff_u8x16x4(const uint8_t *pix1, intptr_t stride_pix1,
+                                     const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
 {
-    int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
+    uint8x16_t s1[4], s2[4];
+    load_u8x16xn<4>(pix1, stride_pix1, s1);
+    load_u8x16xn<4>(pix2, stride_pix2, s2);
+
+    diff[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[0]), vget_low_u8(s2[0])));
+    diff[1] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[1]), vget_low_u8(s2[1])));
+    diff[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[2]), vget_low_u8(s2[2])));
+    diff[3] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[3]), vget_low_u8(s2[3])));
+    diff[4] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[0]), vget_high_u8(s2[0])));
+    diff[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[1]), vget_high_u8(s2[1])));
+    diff[6] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[2]), vget_high_u8(s2[2])));
+    diff[7] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[3]), vget_high_u8(s2[3])));
+}
 
-    SUMSUB_AB(v16, v18, v0,  v2);
-    SUMSUB_AB(v17, v19, v1,  v3);
+#endif // HIGH_BIT_DEPTH
 
-    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
+// 4-way Hadamard vertical pass.
+static inline void hadamard_4_v(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
+{
+    int16x8_t s0, s1, d0, d1;
 
-    transpose_8h_8h(v0,  v1,  v16, v17);
-    transpose_8h_8h(v2,  v3,  v18, v19);
-    transpose_8h_8h(v4,  v5,  v20, v21);
-    transpose_8h_8h(v6,  v7,  v22, v23);
+    sumsubq_s16(&s0, &d0, in_coefs[0], in_coefs[1]);
+    sumsubq_s16(&s1, &d1, in_coefs[2], in_coefs[3]);
 
-    SUMSUB_AB(v16, v17, v0,  v1);
-    SUMSUB_AB(v18, v19, v2,  v3);
-    SUMSUB_AB(v20, v21, v4,  v5);
-    SUMSUB_AB(v22, v23, v6,  v7);
+    sumsubq_s16(&out_coefs[0], &out_coefs[2], s0, s1);
+    sumsubq_s16(&out_coefs[1], &out_coefs[3], d0, d1);
+}
 
-    transpose_4s_8h(v0,  v2,  v16, v18);
-    transpose_4s_8h(v1,  v3,  v17, v19);
-    transpose_4s_8h(v4,  v6,  v20, v22);
-    transpose_4s_8h(v5,  v7,  v21, v23);
+// 8-way Hadamard vertical pass.
+static inline void hadamard_8_v(const int16x8_t in_coefs[8], int16x8_t out_coefs[8])
+{
+    int16x8_t temp[8];
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
-    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
-    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
-    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
-    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
-    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
-    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+    hadamard_4_v(in_coefs, temp);
+    hadamard_4_v(in_coefs + 4, temp + 4);
 
-    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
-    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
-    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
-    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
+    sumsubq_s16(&out_coefs[0], &out_coefs[4], temp[0], temp[4]);
+    sumsubq_s16(&out_coefs[1], &out_coefs[5], temp[1], temp[5]);
+    sumsubq_s16(&out_coefs[2], &out_coefs[6], temp[2], temp[6]);
+    sumsubq_s16(&out_coefs[3], &out_coefs[7], temp[3], temp[7]);
 }
 
-#if HIGH_BIT_DEPTH
-
-#if (X265_DEPTH > 10)
-static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
-                                   const int32x4_t s1, const int32x4_t s2)
+// 4-way Hadamard horizontal pass.
+static inline void hadamard_4_h(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
 {
-    int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
-    int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
+    int16x8_t s0, s1, d0, d1, t0, t1, t2, t3;
+
+    transpose_s16_s16x2(&t0, &t1, in_coefs[0], in_coefs[1]);
+    transpose_s16_s16x2(&t2, &t3, in_coefs[2], in_coefs[3]);
+
+    sumsubq_s16(&s0, &d0, t0, t1);
+    sumsubq_s16(&s1, &d1, t2, t3);
 
-    t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
-    t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+    transpose_s16_s32x2(&out_coefs[0], &out_coefs[1], s0, s1);
+    transpose_s16_s32x2(&out_coefs[2], &out_coefs[3], d0, d1);
 }
 
-static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
+#if X265_DEPTH != 12
+// 8-way Hadamard horizontal pass.
+static inline void hadamard_8_h(int16x8_t coefs[8], uint16x8_t out[4])
 {
-    sum = vaddq_s32(a, b);
-    sub = vsubq_s32(a, b);
+    int16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+    int16x8_t temp[8];
+
+    hadamard_4_h(coefs, temp);
+    hadamard_4_h(coefs + 4, temp + 4);
+
+    sumsubq_s16(&s0, &d0, temp[0], temp[1]);
+    sumsubq_s16(&s1, &d1, temp[2], temp[3]);
+    sumsubq_s16(&s2, &d2, temp[4], temp[5]);
+    sumsubq_s16(&s3, &d3, temp[6], temp[7]);
+
+    transpose_s16_s64x2(&temp[0], &temp[1], s0, s2);
+    transpose_s16_s64x2(&temp[2], &temp[3], s1, s3);
+    transpose_s16_s64x2(&temp[4], &temp[5], d0, d2);
+    transpose_s16_s64x2(&temp[6], &temp[7], d1, d3);
+
+    out[0] = max_abs_s16(temp[0], temp[1]);
+    out[1] = max_abs_s16(temp[2], temp[3]);
+    out[2] = max_abs_s16(temp[4], temp[5]);
+    out[3] = max_abs_s16(temp[6], temp[7]);
 }
 
-static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
-        const int16x8_t a, const int16x8_t b)
+#else // X265_DEPTH == 12
+static inline void hadamard_8_h(int16x8_t coefs[8], uint32x4_t out[4])
 {
-    suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
-    sumh = vaddl_high_s16(a, b);
-    subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
-    subh = vsubl_high_s16(a, b);
+    int16x8_t a[8];
+
+    transpose_s16_s16x2(&a[0], &a[1], coefs[0], coefs[1]);
+    transpose_s16_s16x2(&a[2], &a[3], coefs[2], coefs[3]);
+    transpose_s16_s16x2(&a[4], &a[5], coefs[4], coefs[5]);
+    transpose_s16_s16x2(&a[6], &a[7], coefs[6], coefs[7]);
+
+    int32x4_t a_lo[8], a_hi[8], b_lo[8], b_hi[8];
+
+    sumsublq_s16(&a_lo[0], &a_hi[0], &a_lo[4], &a_hi[4], a[0], a[1]);
+    sumsublq_s16(&a_lo[1], &a_hi[1], &a_lo[5], &a_hi[5], a[2], a[3]);
+    sumsublq_s16(&a_lo[2], &a_hi[2], &a_lo[6], &a_hi[6], a[4], a[5]);
+    sumsublq_s16(&a_lo[3], &a_hi[3], &a_lo[7], &a_hi[7], a[6], a[7]);
+
+    transpose_inplace_s32_s64x2(&a_lo[0], &a_lo[1]);
+    transpose_inplace_s32_s64x2(&a_lo[2], &a_lo[3]);
+    transpose_inplace_s32_s64x2(&a_lo[4], &a_lo[5]);
+    transpose_inplace_s32_s64x2(&a_lo[6], &a_lo[7]);
+
+    transpose_inplace_s32_s64x2(&a_hi[0], &a_hi[1]);
+    transpose_inplace_s32_s64x2(&a_hi[2], &a_hi[3]);
+    transpose_inplace_s32_s64x2(&a_hi[4], &a_hi[5]);
+    transpose_inplace_s32_s64x2(&a_hi[6], &a_hi[7]);
+
+    sumsubq_s32(&b_lo[0], &b_lo[1], a_lo[0], a_lo[1]);
+    sumsubq_s32(&b_lo[2], &b_lo[3], a_lo[2], a_lo[3]);
+    sumsubq_s32(&b_lo[4], &b_lo[5], a_lo[4], a_lo[5]);
+    sumsubq_s32(&b_lo[6], &b_lo[7], a_lo[6], a_lo[7]);
+
+    sumsubq_s32(&b_hi[0], &b_hi[1], a_hi[0], a_hi[1]);
+    sumsubq_s32(&b_hi[2], &b_hi[3], a_hi[2], a_hi[3]);
+    sumsubq_s32(&b_hi[4], &b_hi[5], a_hi[4], a_hi[5]);
+    sumsubq_s32(&b_hi[6], &b_hi[7], a_hi[6], a_hi[7]);
+
+    uint32x4_t max0_lo = max_abs_s32(b_lo[0], b_hi[0]);
+    uint32x4_t max1_lo = max_abs_s32(b_lo[1], b_hi[1]);
+    uint32x4_t max2_lo = max_abs_s32(b_lo[2], b_hi[2]);
+    uint32x4_t max3_lo = max_abs_s32(b_lo[3], b_hi[3]);
+    uint32x4_t max0_hi = max_abs_s32(b_lo[4], b_hi[4]);
+    uint32x4_t max1_hi = max_abs_s32(b_lo[5], b_hi[5]);
+    uint32x4_t max2_hi = max_abs_s32(b_lo[6], b_hi[6]);
+    uint32x4_t max3_hi = max_abs_s32(b_lo[7], b_hi[7]);
+
+    out[0] = vaddq_u32(max0_lo, max0_hi);
+    out[1] = vaddq_u32(max1_lo, max1_hi);
+    out[2] = vaddq_u32(max2_lo, max2_hi);
+    out[3] = vaddq_u32(max3_lo, max3_hi);
 }
 
-#endif
+#endif // X265_DEPTH != 12
 
-static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
-                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
-                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int hadamard_4x4(int16x8_t a0, int16x8_t a1)
 {
-    uint16x8_t r0, r1, r2, r3;
-    uint16x8_t t0, t1, t2, t3;
-    int16x8_t v16, v17;
-    int16x8_t v18, v19;
-
-    r0 = vld1q_u16(pix1 + 0 * stride_pix1);
-    r1 = vld1q_u16(pix1 + 1 * stride_pix1);
-    r2 = vld1q_u16(pix1 + 2 * stride_pix1);
-    r3 = vld1q_u16(pix1 + 3 * stride_pix1);
-
-    t0 = vld1q_u16(pix2 + 0 * stride_pix2);
-    t1 = vld1q_u16(pix2 + 1 * stride_pix2);
-    t2 = vld1q_u16(pix2 + 2 * stride_pix2);
-    t3 = vld1q_u16(pix2 + 3 * stride_pix2);
-
-    v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+    int16x8_t sum, dif, t0, t1;
+    sumsubq_s16(&sum, &dif, a0, a1);
 
-    r0 = vld1q_u16(pix1 + 4 * stride_pix1);
-    r1 = vld1q_u16(pix1 + 5 * stride_pix1);
-    r2 = vld1q_u16(pix1 + 6 * stride_pix1);
-    r3 = vld1q_u16(pix1 + 7 * stride_pix1);
+    transpose_s16_s64x2(&t0, &t1, sum, dif);
+    sumsubq_s16(&sum, &dif, t0, t1);
 
-    t0 = vld1q_u16(pix2 + 4 * stride_pix2);
-    t1 = vld1q_u16(pix2 + 5 * stride_pix2);
-    t2 = vld1q_u16(pix2 + 6 * stride_pix2);
-    t3 = vld1q_u16(pix2 + 7 * stride_pix2);
+    transpose_s16_s16x2(&t0, &t1, sum, dif);
+    sumsubq_s16(&sum, &dif, t0, t1);
 
-    v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+    transpose_s16_s32x2(&t0, &t1, sum, dif);
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    uint16x8_t max = max_abs_s16(t0, t1);
 
+    return vaddlvq_u16(max);
 }
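
For review purposes, a compact scalar model of what hadamard_4x4 computes once the two packed rows are unpacked: transform the 4x4 residual with a 4-point Hadamard along rows and columns, sum the absolute coefficients and halve the total (the halving being supplied by the folded max_abs_s16 stage above). This is only a sketch mirroring the primitive signatures, with pixel and intptr_t taken from the surrounding headers; it is not the project's C reference implementation.

    #include <stdlib.h>

    static int satd_4x4_model(const pixel *pix1, intptr_t stride_pix1,
                              const pixel *pix2, intptr_t stride_pix2)
    {
        int d[4][4], t[4][4], sum = 0;

        /* Residual block. */
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = pix1[i * stride_pix1 + j] - pix2[i * stride_pix2 + j];

        /* 4-point Hadamard along each row. */
        for (int i = 0; i < 4; i++)
        {
            int a0 = d[i][0] + d[i][2], a1 = d[i][1] + d[i][3];
            int a2 = d[i][0] - d[i][2], a3 = d[i][1] - d[i][3];
            t[i][0] = a0 + a1;  t[i][1] = a0 - a1;
            t[i][2] = a2 + a3;  t[i][3] = a2 - a3;
        }
        /* 4-point Hadamard along each column, accumulating |coefficient|. */
        for (int j = 0; j < 4; j++)
        {
            int a0 = t[0][j] + t[2][j], a1 = t[1][j] + t[3][j];
            int a2 = t[0][j] - t[2][j], a3 = t[1][j] - t[3][j];
            sum += abs(a0 + a1) + abs(a0 - a1) + abs(a2 + a3) + abs(a2 - a3);
        }
        return sum >> 1;
    }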
 
-
-
-
-static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
-                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+// Calculate two 4x4 Hadamard transformations.
+static void hadamard_4x4_dual(int16x8_t diff[4], uint16x8_t *out)
 {
-    uint16x8_t r0, r1, r2, r3;
-    uint16x8_t t0, t1, t2, t3;
-    int16x8_t v16, v17, v20, v21;
-    int16x8_t v18, v19, v22, v23;
+    int16x8_t temp[4];
 
-    r0 = vld1q_u16(pix1 + 0 * stride_pix1);
-    r1 = vld1q_u16(pix1 + 1 * stride_pix1);
-    r2 = vld1q_u16(pix1 + 2 * stride_pix1);
-    r3 = vld1q_u16(pix1 + 3 * stride_pix1);
+    hadamard_4_v(diff, temp);
+    hadamard_4_h(temp, diff);
 
-    t0 = vld1q_u16(pix2 + 0 * stride_pix2);
-    t1 = vld1q_u16(pix2 + 1 * stride_pix2);
-    t2 = vld1q_u16(pix2 + 2 * stride_pix2);
-    t3 = vld1q_u16(pix2 + 3 * stride_pix2);
+    uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+    uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
 
-    v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
-
-    r0 = vld1q_u16(pix1 + 0 * stride_pix1 + 8);
-    r1 = vld1q_u16(pix1 + 1 * stride_pix1 + 8);
-    r2 = vld1q_u16(pix1 + 2 * stride_pix1 + 8);
-    r3 = vld1q_u16(pix1 + 3 * stride_pix1 + 8);
+    *out = vaddq_u16(sum0, sum1);
+}
 
-    t0 = vld1q_u16(pix2 + 0 * stride_pix2 + 8);
-    t1 = vld1q_u16(pix2 + 1 * stride_pix2 + 8);
-    t2 = vld1q_u16(pix2 + 2 * stride_pix2 + 8);
-    t3 = vld1q_u16(pix2 + 3 * stride_pix2 + 8);
+// Calculate four 4x4 Hadamard transformations.
+static inline void hadamard_4x4_quad(int16x8_t diff[8], uint16x8_t out[2])
+{
+    int16x8_t temp[8];
 
-    v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
-    v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-    v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
-    v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+    hadamard_4_v(diff, temp);
+    hadamard_4_v(diff + 4, temp + 4);
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    hadamard_4_h(temp, diff);
+    hadamard_4_h(temp + 4, diff + 4);
 
-    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+    uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+    uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
+    uint16x8_t sum2 = max_abs_s16(diff[4], diff[5]);
+    uint16x8_t sum3 = max_abs_s16(diff[6], diff[7]);
 
+    out[0] = vaddq_u16(sum0, sum1);
+    out[1] = vaddq_u16(sum2, sum3);
 }
 
-
-int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if X265_DEPTH == 8
+static inline void hadamard_8x8(int16x8_t diff[8], uint16x8_t out[2])
 {
-    uint16x4_t t0_0 = vld1_u16(pix1 + 0 * stride_pix1);
-    uint16x4_t t1_0 = vld1_u16(pix1 + 1 * stride_pix1);
-    uint16x4_t t0_1 = vld1_u16(pix1 + 2 * stride_pix1);
-    uint16x4_t t1_1 = vld1_u16(pix1 + 3 * stride_pix1);
-    uint16x8_t t0 = vcombine_u16(t0_0, t0_1);
-    uint16x8_t t1 = vcombine_u16(t1_0, t1_1);
+    int16x8_t temp[8];
+    uint16x8_t sum[4];
 
-    uint16x4_t r0_0 = vld1_u16(pix2 + 0 * stride_pix2);
-    uint16x4_t r1_0 = vld1_u16(pix2 + 1 * stride_pix2);
-    uint16x4_t r0_1 = vld1_u16(pix2 + 2 * stride_pix2);
-    uint16x4_t r1_1 = vld1_u16(pix2 + 3 * stride_pix2);
-    uint16x8_t r0 = vcombine_u16(r0_0, r0_1);
-    uint16x8_t r1 = vcombine_u16(r1_0, r1_1);
+    hadamard_8_v(diff, temp);
+    hadamard_8_h(temp, sum);
 
-    int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(t0, r0));
-    int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-
-    return _satd_4x4_neon(v0, v1);
+    out[0] = vaddq_u16(sum[0], sum[1]);
+    out[1] = vaddq_u16(sum[2], sum[3]);
 }
 
+#elif X265_DEPTH == 10
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
+{
+    int16x8_t temp[8];
+    uint16x8_t sum[4];
 
+    hadamard_8_v(diff, temp);
+    hadamard_8_h(temp, sum);
 
+    out[0] = vpaddlq_u16(sum[0]);
+    out[1] = vpaddlq_u16(sum[1]);
+    out[0] = vpadalq_u16(out[0], sum[2]);
+    out[1] = vpadalq_u16(out[1], sum[3]);
+}
 
-
-
-int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#elif X265_DEPTH == 12
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
 {
-    uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
-
-    i0 = vld1q_u16(pix1 + 0 * stride_pix1);
-    i1 = vld1q_u16(pix2 + 0 * stride_pix2);
-    i2 = vld1q_u16(pix1 + 1 * stride_pix1);
-    i3 = vld1q_u16(pix2 + 1 * stride_pix2);
-    i4 = vld1q_u16(pix1 + 2 * stride_pix1);
-    i5 = vld1q_u16(pix2 + 2 * stride_pix2);
-    i6 = vld1q_u16(pix1 + 3 * stride_pix1);
-    i7 = vld1q_u16(pix2 + 3 * stride_pix2);
+    int16x8_t temp[8];
+    uint32x4_t sum[4];
 
-    int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(i0, i1));
-    int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(i2, i3));
-    int16x8_t v2 = vreinterpretq_s16_u16(vsubq_u16(i4, i5));
-    int16x8_t v3 = vreinterpretq_s16_u16(vsubq_u16(i6, i7));
+    hadamard_8_v(diff, temp);
+    hadamard_8_h(temp, sum);
 
-    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+    out[0] = vaddq_u32(sum[0], sum[1]);
+    out[1] = vaddq_u32(sum[2], sum[3]);
 }
 
+#endif // X265_DEPTH == 8
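
The three bitdepth variants of hadamard_8x8 appear to be driven by the worst-case magnitudes, with the final butterfly folded into the max-abs step:

    after the vertical pass (8 * max|diff|):
        8-bit: 2040     10-bit: 8184     12-bit: 32760    (all still fit int16)
    before the folded final stage (32 * max|diff|):
        8-bit: 8160     10-bit: 32736    12-bit: 131040

so the 12-bit build has to widen to int32 for the horizontal pass, the 10-bit build fits int16 throughout but must accumulate the block total in uint32, and the 8-bit build can keep even the partial sums in uint16.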
 
-int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
-    int16x8_t v0, v1, v2, v3;
+    uint16x4_t s[4], r[4];
+    load_u16x4xn<4>(pix1, stride_pix1, s);
+    load_u16x4xn<4>(pix2, stride_pix2, r);
 
-    for (int offset = 0; offset <= 12; offset += 4)
-    {
-        _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1,
-                        pix2 + offset * stride_pix2,stride_pix2,
-                        v0, v1, v2, v3);
-        v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v0));
-        v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v1));
-        v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v2));
-        v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v3));
-    }
+    uint16x8_t s0 = vcombine_u16(s[0], s[2]);
+    uint16x8_t s1 = vcombine_u16(s[1], s[3]);
+    uint16x8_t r0 = vcombine_u16(r[0], r[2]);
+    uint16x8_t r1 = vcombine_u16(r[1], r[3]);
+
+    int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+    int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
 
-    return vaddvq_u32(vaddq_u32(v30, v31));
+    return hadamard_4x4(diff0, diff1);
 }
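
Note that diff1 is formed as r1 - s1 rather than s1 - r1, exactly as in the code being replaced. This is harmless: negating the odd rows of the residual only permutes the Hadamard outputs. For one column d0..d3,

    H(d0, -d1, d2, -d3) = (d0-d1+d2-d3, d0+d1+d2+d3, d0-d1-d2+d3, d0+d1-d2-d3),

which is H(d0, d1, d2, d3) with its components swapped in pairs, so the sum of absolute transformed values is unchanged.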
 
-#else       //HIGH_BIT_DEPTH
+static inline int pixel_satd_4x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[4];
+
+    uint16x4_t s[8], r[8];
+    load_u16x4xn<8>(pix1, stride_pix1, s);
+    load_u16x4xn<8>(pix2, stride_pix2, r);
+
+    uint16x8_t s0 = vcombine_u16(s[0], s[4]);
+    uint16x8_t s1 = vcombine_u16(s[1], s[5]);
+    uint16x8_t s2 = vcombine_u16(s[2], s[6]);
+    uint16x8_t s3 = vcombine_u16(s[3], s[7]);
+    uint16x8_t r0 = vcombine_u16(r[0], r[4]);
+    uint16x8_t r1 = vcombine_u16(r[1], r[5]);
+    uint16x8_t r2 = vcombine_u16(r[2], r[6]);
+    uint16x8_t r3 = vcombine_u16(r[3], r[7]);
+
+    diff[0] = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+    diff[1] = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
+    diff[2] = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+    diff[3] = vreinterpretq_s16_u16(vsubq_u16(r3, s3));
+
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
+
+    return vaddlvq_u16(out);
+}
 
-static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
-                            int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint8x16_t r0, r1, r2, r3;
-    uint8x16_t t0, t1, t2, t3;
-    int16x8_t v16, v17, v20, v21;
-    int16x8_t v18, v19, v22, v23;
+    int16x8_t diff[4];
+    load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
 
-    r0 = vld1q_u8(pix1 + 0 * stride_pix1);
-    r1 = vld1q_u8(pix1 + 1 * stride_pix1);
-    r2 = vld1q_u8(pix1 + 2 * stride_pix1);
-    r3 = vld1q_u8(pix1 + 3 * stride_pix1);
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
 
-    t0 = vld1q_u8(pix2 + 0 * stride_pix2);
-    t1 = vld1q_u8(pix2 + 1 * stride_pix2);
-    t2 = vld1q_u8(pix2 + 2 * stride_pix2);
-    t3 = vld1q_u8(pix2 + 3 * stride_pix2);
+    return vaddlvq_u16(out);
+}
 
-    v16 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r0), vget_low_u8(t0)));
-    v20 = vreinterpretq_s16_u16(vsubl_high_u8(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r1), vget_low_u8(t1)));
-    v21 = vreinterpretq_s16_u16(vsubl_high_u8(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r2), vget_low_u8(t2)));
-    v22 = vreinterpretq_s16_u16(vsubl_high_u8(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r3), vget_low_u8(t3)));
-    v23 = vreinterpretq_s16_u16(vsubl_high_u8(r3, t3));
+static inline int pixel_satd_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
+    uint16x8_t out[2];
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
 
-    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+    uint32x4_t res = vpaddlq_u16(out[0]);
+    res = vpadalq_u16(res, out[1]);
 
+    return vaddvq_u32(res);
 }
 
-
-static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
-                                int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
-                                int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int pixel_satd_8x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                       const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint8x8_t r0, r1, r2, r3;
-    uint8x8_t t0, t1, t2, t3;
-    int16x8_t v16, v17;
-    int16x8_t v18, v19;
+    int16x8_t diff[16];
+    uint16x8_t out[4];
+
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4_dual(pix1 + 8 * stride_pix1, stride_pix1,
+                           pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
 
-    r0 = vld1_u8(pix1 + 0 * stride_pix1);
-    r1 = vld1_u8(pix1 + 1 * stride_pix1);
-    r2 = vld1_u8(pix1 + 2 * stride_pix1);
-    r3 = vld1_u8(pix1 + 3 * stride_pix1);
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
-    t0 = vld1_u8(pix2 + 0 * stride_pix2);
-    t1 = vld1_u8(pix2 + 1 * stride_pix2);
-    t2 = vld1_u8(pix2 + 2 * stride_pix2);
-    t3 = vld1_u8(pix2 + 3 * stride_pix2);
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
-    v16 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
-    v17 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
-    v18 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
-    v19 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+    uint32x4_t res = vpaddlq_u16(sum0);
+    res = vpadalq_u16(res, sum1);
 
-    r0 = vld1_u8(pix1 + 4 * stride_pix1);
-    r1 = vld1_u8(pix1 + 5 * stride_pix1);
-    r2 = vld1_u8(pix1 + 6 * stride_pix1);
-    r3 = vld1_u8(pix1 + 7 * stride_pix1);
+    return vaddvq_u32(res);
+}
 
-    t0 = vld1_u8(pix2 + 4 * stride_pix2);
-    t1 = vld1_u8(pix2 + 5 * stride_pix2);
-    t2 = vld1_u8(pix2 + 6 * stride_pix2);
-    t3 = vld1_u8(pix2 + 7 * stride_pix2);
+static inline int pixel_satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                       const uint16_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
 
-    v20 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
-    v21 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
-    v22 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
-    v23 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+    load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
 
+    uint16x8_t sum0, sum1;
+    hadamard_4x4_dual(diff, &sum0);
+    hadamard_4x4_dual(diff + 4, &sum1);
 
-    SUMSUB_AB(v0,  v1,  v16, v17);
-    SUMSUB_AB(v2,  v3,  v18, v19);
+    sum0 = vaddq_u16(sum0, sum1);
 
+    return vaddlvq_u16(sum0);
 }
 
-int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                       const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint8x8_t t0 = load_u8x4x2(pix1, 2 * stride_pix1);
-    uint8x8_t t1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
+    int16x8_t diff[16];
+    uint16x8_t out[4];
 
-    uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
-    uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u16x8x4_dual(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
 
-    return _satd_4x4_neon(vreinterpretq_s16_u16(vsubl_u8(t0, r0)),
-                          vreinterpretq_s16_u16(vsubl_u8(r1, t1)));
-}
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
+#if X265_DEPTH == 10
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
-int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
-{
-    uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
+    sum0 = vaddq_u16(sum0, sum1);
 
-    i0 = vld1_u8(pix1 + 0 * stride_pix1);
-    i1 = vld1_u8(pix2 + 0 * stride_pix2);
-    i2 = vld1_u8(pix1 + 1 * stride_pix1);
-    i3 = vld1_u8(pix2 + 1 * stride_pix2);
-    i4 = vld1_u8(pix1 + 2 * stride_pix1);
-    i5 = vld1_u8(pix2 + 2 * stride_pix2);
-    i6 = vld1_u8(pix1 + 3 * stride_pix1);
-    i7 = vld1_u8(pix2 + 3 * stride_pix2);
+    return vaddlvq_u16(sum0);
+#else // X265_DEPTH == 12
+    uint32x4_t sum0 = vpaddlq_u16(out[0]);
+    uint32x4_t sum1 = vpaddlq_u16(out[1]);
+    sum0 = vpadalq_u16(sum0, out[2]);
+    sum1 = vpadalq_u16(sum1, out[3]);
 
-    int16x8_t v0 = vreinterpretq_s16_u16(vsubl_u8(i0, i1));
-    int16x8_t v1 = vreinterpretq_s16_u16(vsubl_u8(i2, i3));
-    int16x8_t v2 = vreinterpretq_s16_u16(vsubl_u8(i4, i5));
-    int16x8_t v3 = vreinterpretq_s16_u16(vsubl_u8(i6, i7));
+    sum0 = vaddq_u32(sum0, sum1);
 
-    return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+    return vaddvq_u32(sum0);
+#endif // X265_DEPTH == 10
 }
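
The 10-bit/12-bit split above looks like an overflow guard on the uint16 partial sums: each lane of out[] carries two folded 4x4 results, i.e. at most 2 * 8 * max|diff|.

    10-bit: out lane <= 16368, out[0] + out[1] <= 32736, final uint16 total <= 65472 (still fits)
    12-bit: a single out lane can already reach 65520, so the sums are widened to uint32 first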
 
-int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                        const uint16_t *pix2, intptr_t stride_pix2)
 {
-    uint16x8_t v30, v31;
-    int16x8_t v0, v1, v2, v3;
-    uint16x8_t t0, t1;
+    uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+    int16x8_t diff[8];
+    uint16x8_t out[2];
 
-    _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
-    v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    for (int i = 0; i < 4; ++i)
+    {
+        load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+        load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
 
-    _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
-    v30 = vaddq_u16(v30, t0);
-    v31 = vaddq_u16(v31, t1);
+        hadamard_4x4_quad(diff, out);
 
-    _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
-    v30 = vaddq_u16(v30, t0);
-    v31 = vaddq_u16(v31, t1);
+        sum[0] = vpadalq_u16(sum[0], out[0]);
+        sum[1] = vpadalq_u16(sum[1], out[1]);
 
-    _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
-    v30 = vaddq_u16(v30, t0);
-    v31 = vaddq_u16(v31, t1);
+        pix1 += 4 * stride_pix1;
+        pix2 += 4 * stride_pix2;
+    }
 
-    uint32x4_t sum0 = vpaddlq_u16(v30);
-    uint32x4_t sum1 = vpaddlq_u16(v31);
-    sum0 = vaddq_u32(sum0, sum1);
-    return vaddvq_u32(sum0);
+    return vaddvq_u32(vaddq_u32(sum[0], sum[1]));
 }
-#endif      //HIGH_BIT_DEPTH
 
-#if HIGH_BIT_DEPTH
-typedef uint32x4_t sa8d_out_type;
-#else
-typedef uint16x8_t sa8d_out_type;
-#endif
-
-static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
-                                      int16x8_t v3, int16x8_t v20,
-                                      int16x8_t v21, int16x8_t v22,
-                                      int16x8_t v23, sa8d_out_type &out0,
-                                      sa8d_out_type &out1)
+static inline int pixel_sa8d_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                      const uint16_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v16, v17, v18, v19;
-    int16x8_t v4, v5, v6, v7;
+    int16x8_t diff[8];
+    uint32x4_t res[2];
 
-    SUMSUB_AB(v16, v18, v0,  v2);
-    SUMSUB_AB(v17, v19, v1,  v3);
+    load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
 
-    HADAMARD4_V(v20, v21, v22, v23, v0,  v1, v2, v3);
+    uint32x4_t s = vaddq_u32(res[0], res[1]);
 
-    SUMSUB_AB(v0,  v16, v16, v20);
-    SUMSUB_AB(v1,  v17, v17, v21);
-    SUMSUB_AB(v2,  v18, v18, v22);
-    SUMSUB_AB(v3,  v19, v19, v23);
-
-    transpose_8h_8h(v20, v21, v16, v17);
-    transpose_8h_8h(v4,  v5,  v0,  v1);
-    transpose_8h_8h(v22, v23, v18, v19);
-    transpose_8h_8h(v6,  v7,  v2,  v3);
+    return (vaddvq_u32(s) + 1) >> 1;
+}
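
The final (s + 1) >> 1 reproduces the rounding previously done in assembly (the add #1 / lsr #1 pair in the deleted pixel_sa8d_8x8_neon); since the folded total s is exactly half of the full absolute-coefficient sum, this is the same value as (sum_of_abs_coefficients + 2) >> 2.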
 
-#if (X265_DEPTH <= 10)
+static inline int pixel_sa8d_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+                                        const uint16_t *pix2, intptr_t stride_pix2)
+{
+    uint32x4_t sum0, sum1;
 
-    int16x8_t v24, v25;
+    int16x8_t diff[8];
+    uint32x4_t res[2];
 
-    SUMSUB_AB(v2,  v3,  v20, v21);
-    SUMSUB_AB(v24, v25, v4,  v5);
-    SUMSUB_AB(v0,  v1,  v22, v23);
-    SUMSUB_AB(v4,  v5,  v6,  v7);
+    load_diff_u16x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vaddq_u32(res[0], res[1]);
 
-    transpose_4s_8h(v20, v22, v2,  v0);
-    transpose_4s_8h(v21, v23, v3,  v1);
-    transpose_4s_8h(v16, v18, v24, v4);
-    transpose_4s_8h(v17, v19, v25, v5);
+    load_diff_u16x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum1 = vaddq_u32(res[0], res[1]);
 
-    SUMSUB_AB(v0,  v2,  v20, v22);
-    SUMSUB_AB(v1,  v3,  v21, v23);
-    SUMSUB_AB(v4,  v6,  v16, v18);
-    SUMSUB_AB(v5,  v7,  v17, v19);
+    load_diff_u16x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+                      pix2 + 8 * stride_pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vaddq_u32(sum0, res[0]);
+    sum1 = vaddq_u32(sum1, res[1]);
 
-    transpose_2d_8h(v16, v20,  v0,  v4);
-    transpose_2d_8h(v17, v21,  v1,  v5);
-    transpose_2d_8h(v18, v22,  v2,  v6);
-    transpose_2d_8h(v19, v23,  v3,  v7);
+    load_diff_u16x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+                      pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vaddq_u32(sum0, res[0]);
+    sum1 = vaddq_u32(sum1, res[1]);
 
-    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v16));
-    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v17));
-    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v18));
-    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v19));
-    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v20));
-    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v21));
-    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v22));
-    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v23));
+    sum0 = vaddq_u32(sum0, sum1);
 
-    uint16x8_t max0 = vmaxq_u16(abs0, abs4);
-    uint16x8_t max1 = vmaxq_u16(abs1, abs5);
-    uint16x8_t max2 = vmaxq_u16(abs2, abs6);
-    uint16x8_t max3 = vmaxq_u16(abs3, abs7);
+    return (vaddvq_u32(sum0) + 1) >> 1;
+}
 
-#if HIGH_BIT_DEPTH
-    out0 = vpaddlq_u16(max0);
-    out1 = vpaddlq_u16(max1);
-    out0 = vpadalq_u16(out0, max2);
-    out1 = vpadalq_u16(out1, max3);
-
-#else //HIGH_BIT_DEPTH
-
-    out0 = vaddq_u16(max0, max1);
-    out1 = vaddq_u16(max2, max3);
-
-#endif //HIGH_BIT_DEPTH
-
-#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32, each int16x8 is up-convreted to 2 int32x4 (low and high)
-
-    int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
-    int32x4_t v22l, v22h, v23l, v23h;
-    int32x4_t v4l, v4h, v5l, v5h;
-    int32x4_t v6l, v6h, v7l, v7h;
-    int32x4_t v16l, v16h, v17l, v17h;
-    int32x4_t v18l, v18h, v19l, v19h;
-    int32x4_t v20l, v20h, v21l, v21h;
-
-    ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
-    ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
-
-    v22l = vmovl_s16(vget_low_s16(v22));
-    v22h = vmovl_high_s16(v22);
-    v23l = vmovl_s16(vget_low_s16(v23));
-    v23h = vmovl_high_s16(v23);
-
-    ISUMSUB_AB(v0l,  v1l,  v22l, v23l);
-    ISUMSUB_AB(v0h,  v1h,  v22h, v23h);
-
-    v6l = vmovl_s16(vget_low_s16(v6));
-    v6h = vmovl_high_s16(v6);
-    v7l = vmovl_s16(vget_low_s16(v7));
-    v7h = vmovl_high_s16(v7);
-
-    ISUMSUB_AB(v4l,  v5l,  v6l,  v7l);
-    ISUMSUB_AB(v4h,  v5h,  v6h,  v7h);
-
-    transpose_2d_4s(v20l, v22l, v2l,  v0l);
-    transpose_2d_4s(v21l, v23l, v3l,  v1l);
-    transpose_2d_4s(v16l, v18l, v24l, v4l);
-    transpose_2d_4s(v17l, v19l, v25l, v5l);
-
-    transpose_2d_4s(v20h, v22h, v2h,  v0h);
-    transpose_2d_4s(v21h, v23h, v3h,  v1h);
-    transpose_2d_4s(v16h, v18h, v24h, v4h);
-    transpose_2d_4s(v17h, v19h, v25h, v5h);
-
-    ISUMSUB_AB(v0l,  v2l,  v20l, v22l);
-    ISUMSUB_AB(v1l,  v3l,  v21l, v23l);
-    ISUMSUB_AB(v4l,  v6l,  v16l, v18l);
-    ISUMSUB_AB(v5l,  v7l,  v17l, v19l);
-
-    ISUMSUB_AB(v0h,  v2h,  v20h, v22h);
-    ISUMSUB_AB(v1h,  v3h,  v21h, v23h);
-    ISUMSUB_AB(v4h,  v6h,  v16h, v18h);
-    ISUMSUB_AB(v5h,  v7h,  v17h, v19h);
-
-    v16l = v0l;
-    v16h = v4l;
-    v20l = v0h;
-    v20h = v4h;
-
-    v17l = v1l;
-    v17h = v5l;
-    v21l = v1h;
-    v21h = v5h;
-
-    v18l = v2l;
-    v18h = v6l;
-    v22l = v2h;
-    v22h = v6h;
-
-    v19l = v3l;
-    v19h = v7l;
-    v23l = v3h;
-    v23h = v7h;
-
-    uint32x4_t abs0_lo = vreinterpretq_u32_s32(vabsq_s32(v16l));
-    uint32x4_t abs1_lo = vreinterpretq_u32_s32(vabsq_s32(v17l));
-    uint32x4_t abs2_lo = vreinterpretq_u32_s32(vabsq_s32(v18l));
-    uint32x4_t abs3_lo = vreinterpretq_u32_s32(vabsq_s32(v19l));
-    uint32x4_t abs4_lo = vreinterpretq_u32_s32(vabsq_s32(v20l));
-    uint32x4_t abs5_lo = vreinterpretq_u32_s32(vabsq_s32(v21l));
-    uint32x4_t abs6_lo = vreinterpretq_u32_s32(vabsq_s32(v22l));
-    uint32x4_t abs7_lo = vreinterpretq_u32_s32(vabsq_s32(v23l));
-
-    uint32x4_t abs0_hi = vreinterpretq_u32_s32(vabsq_s32(v16h));
-    uint32x4_t abs1_hi = vreinterpretq_u32_s32(vabsq_s32(v17h));
-    uint32x4_t abs2_hi = vreinterpretq_u32_s32(vabsq_s32(v18h));
-    uint32x4_t abs3_hi = vreinterpretq_u32_s32(vabsq_s32(v19h));
-    uint32x4_t abs4_hi = vreinterpretq_u32_s32(vabsq_s32(v20h));
-    uint32x4_t abs5_hi = vreinterpretq_u32_s32(vabsq_s32(v21h));
-    uint32x4_t abs6_hi = vreinterpretq_u32_s32(vabsq_s32(v22h));
-    uint32x4_t abs7_hi = vreinterpretq_u32_s32(vabsq_s32(v23h));
-
-    uint32x4_t max0_lo = vmaxq_u32(abs0_lo, abs4_lo);
-    uint32x4_t max1_lo = vmaxq_u32(abs1_lo, abs5_lo);
-    uint32x4_t max2_lo = vmaxq_u32(abs2_lo, abs6_lo);
-    uint32x4_t max3_lo = vmaxq_u32(abs3_lo, abs7_lo);
-
-    uint32x4_t max0_hi = vmaxq_u32(abs0_hi, abs4_hi);
-    uint32x4_t max1_hi = vmaxq_u32(abs1_hi, abs5_hi);
-    uint32x4_t max2_hi = vmaxq_u32(abs2_hi, abs6_hi);
-    uint32x4_t max3_hi = vmaxq_u32(abs3_hi, abs7_hi);
-
-    uint32x4_t sum0 = vaddq_u32(max0_lo, max0_hi);
-    uint32x4_t sum1 = vaddq_u32(max1_lo, max1_hi);
-    uint32x4_t sum2 = vaddq_u32(max2_lo, max2_hi);
-    uint32x4_t sum3 = vaddq_u32(max3_lo, max3_hi);
-
-    out0 = vaddq_u32(sum0, sum1);
-    out1 = vaddq_u32(sum2, sum3);
+#else // !HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
+{
+    uint8x8_t s0 = load_u8x4x2(pix1, 2 * stride_pix1);
+    uint8x8_t s1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
 
+    uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
+    uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
 
-#endif
+    int16x8_t diff0 = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+    int16x8_t diff1 = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
 
+    return hadamard_4x4(diff0, diff1);
 }
 
-
-
-static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
-                                  int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_4x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
 {
+    int16x8_t diff[4];
+
+    uint8x8_t s0 = load_u8x4x2(pix1 + 0 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t s1 = load_u8x4x2(pix1 + 1 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t s2 = load_u8x4x2(pix1 + 2 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t s3 = load_u8x4x2(pix1 + 3 * stride_pix1, 4 * stride_pix1);
+    uint8x8_t r0 = load_u8x4x2(pix2 + 0 * stride_pix2, 4 * stride_pix2);
+    uint8x8_t r1 = load_u8x4x2(pix2 + 1 * stride_pix2, 4 * stride_pix2);
+    uint8x8_t r2 = load_u8x4x2(pix2 + 2 * stride_pix2, 4 * stride_pix2);
+    uint8x8_t r3 = load_u8x4x2(pix2 + 3 * stride_pix2, 4 * stride_pix2);
+
+    diff[0] = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+    diff[1] = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
+    diff[2] = vreinterpretq_s16_u16(vsubl_u8(s2, r2));
+    diff[3] = vreinterpretq_s16_u16(vsubl_u8(r3, s3));
+
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
+
+    return vaddlvq_u16(out);
+}
 
-    int16x8_t v20, v21, v22, v23;
-    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+static inline int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[4];
 
-}
+    load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
 
+    uint16x8_t out;
+    hadamard_4x4_dual(diff, &out);
 
+    return vaddlvq_u16(out);
+}
 
-int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v0, v1, v2, v3;
+    int16x8_t diff[8];
+    uint16x8_t out[2];
 
-    _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
-    uint16x8_t v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
-    uint16x8_t v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
 
-#if !(HIGH_BIT_DEPTH)
-    uint16x8_t sum = vaddq_u16(v30, v31);
-    return vaddvq_u32(vpaddlq_u16(sum));
-#else
-    uint32x4_t sum = vpaddlq_u16(v30);
-    sum = vpadalq_u16(sum, v31);
-    return vaddvq_u32(sum);
-#endif
-}
+    out[0] = vaddq_u16(out[0], out[1]);
 
+    return vaddlvq_u16(out[0]);
+}
 
-int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                       const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v0, v1, v2, v3;
-    int16x8_t v20, v21, v22, v23;
-    sa8d_out_type res0, res1;
+    int16x8_t diff[16];
+    uint16x8_t out[4];
 
-    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
-
-#if HIGH_BIT_DEPTH
-    uint32x4_t s = vaddq_u32(res0, res1);
-    return (vaddvq_u32(s) + 1) >> 1;
-#else
-    return (vaddlvq_u16(vaddq_u16(res0, res1)) + 1) >> 1;
-#endif
-}
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+                     pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
 
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
+    sum0 = vaddq_u16(sum0, sum1);
 
+    return vaddlvq_u16(sum0);
+}
 
-int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                       const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v0, v1, v2, v3;
-    int16x8_t v20, v21, v22, v23;
-    sa8d_out_type res0, res1;
-    uint32x4_t v30, v31;
+    int16x8_t diff[8];
 
-    _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpaddlq_u16(res0);
-    v31 = vpaddlq_u16(res1);
-#else
-    v30 = vaddq_u32(res0, res1);
-#endif
+    uint16x8_t out[2];
+    hadamard_4x4_dual(diff, &out[0]);
+    hadamard_4x4_dual(diff + 4, &out[1]);
 
-    _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    out[0] = vaddq_u16(out[0], out[1]);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, res0);
-    v31 = vpadalq_u16(v31, res1);
-#else
-    v31 = vaddq_u32(res0, res1);
-#endif
+    return vaddlvq_u16(out[0]);
+}
 
+static inline int pixel_satd_16x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                       const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[16];
+    uint16x8_t out[4];
 
-    _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
-                 v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, res0);
-    v31 = vpadalq_u16(v31, res1);
-#else
-    v30 = vaddq_u32(v30, res0);
-    v31 = vaddq_u32(v31, res1);
-#endif
+    hadamard_4x4_quad(diff, out);
+    hadamard_4x4_quad(diff + 8, out + 2);
 
-    _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
-                 v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+    uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+    uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, res0);
-    v31 = vpadalq_u16(v31, res1);
-#else
-    v30 = vaddq_u32(v30, res0);
-    v31 = vaddq_u32(v31, res1);
-#endif
+    sum0 = vaddq_u16(sum0, sum1);
+
+    return vaddlvq_u16(sum0);
+}
+
+static inline int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                        const uint8_t *pix2, intptr_t stride_pix2)
+{
+    uint16x8_t sum[2], out[2];
+    int16x8_t diff[8];
+
+    load_diff_u8x16x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = out[0];
+    sum[1] = out[1];
+
+    load_diff_u8x16x4(pix1 + 4 * stride_pix1, stride_pix1,
+                      pix2 + 4 * stride_pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = vaddq_u16(sum[0], out[0]);
+    sum[1] = vaddq_u16(sum[1], out[1]);
+
+    load_diff_u8x16x4(pix1 + 8 * stride_pix1, stride_pix1,
+                      pix2 + 8 * stride_pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = vaddq_u16(sum[0], out[0]);
+    sum[1] = vaddq_u16(sum[1], out[1]);
+
+    load_diff_u8x16x4(pix1 + 12 * stride_pix1, stride_pix1,
+                      pix2 + 12 * stride_pix2, stride_pix2, diff);
+    hadamard_4x4_quad(diff, out);
+    sum[0] = vaddq_u16(sum[0], out[0]);
+    sum[1] = vaddq_u16(sum[1], out[1]);
+
+    uint32x4_t sum0 = vpaddlq_u16(sum[0]);
+    uint32x4_t sum1 = vpaddlq_u16(sum[1]);
 
-    v30 = vaddq_u32(v30, v31);
+    sum0 = vaddq_u32(sum0, sum1);
 
-    return (vaddvq_u32(v30) + 1) >> 1;
+    return vaddvq_u32(sum0);
 }
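pixel_satd_16x16_neon above accumulates in uint16 lanes while it can and switches to the widening vpaddlq_u16/vaddvq_u32 pattern for the final reduction, presumably because a 16-bit total would no longer be safe at this block size. A small standalone illustration of those widening reductions (just the intrinsics' semantics, not x265 code):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t v[8] = { 60000, 60000, 60000, 60000, 60000, 60000, 60000, 60000 };
        uint16x8_t a = vld1q_u16(v);

        // vaddlvq_u16 widens each lane to 32 bits while reducing across lanes.
        uint32_t direct = vaddlvq_u16(a);          // 8 * 60000 = 480000

        // vpaddlq_u16 pairwise widen-adds into 32-bit lanes; vpadalq_u16
        // accumulates further uint16 data into those lanes without overflow.
        uint32x4_t acc = vpaddlq_u16(a);
        acc = vpadalq_u16(acc, a);
        uint32_t twice = vaddvq_u32(acc);          // 2 * 480000 = 960000

        printf("%u %u\n", direct, twice);
        return 0;
    }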
 
+static inline int pixel_sa8d_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                      const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
+    uint16x8_t res[2];
 
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
 
+    return (vaddlvq_u16(vaddq_u16(res[0], res[1])) + 1) >> 1;
+}
 
+static inline int pixel_sa8d_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+                                        const uint8_t *pix2, intptr_t stride_pix2)
+{
+    int16x8_t diff[8];
+    uint16x8_t res[2];
+    uint32x4_t sum0, sum1;
+
+    load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpaddlq_u16(res[0]);
+    sum1 = vpaddlq_u16(res[1]);
+
+    load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpadalq_u16(sum0, res[0]);
+    sum1 = vpadalq_u16(sum1, res[1]);
+
+    load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+                     pix2 + 8 * stride_pix2, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpadalq_u16(sum0, res[0]);
+    sum1 = vpadalq_u16(sum1, res[1]);
+
+    load_diff_u8x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+                     pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+    hadamard_8x8(diff, res);
+    sum0 = vpadalq_u16(sum0, res[0]);
+    sum1 = vpadalq_u16(sum1, res[1]);
 
+    sum0 = vaddq_u32(sum0, sum1);
 
+    return (vaddvq_u32(sum0) + 1) >> 1;
+}
 
+#endif // HIGH_BIT_DEPTH
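All of the satd paths above compute the same quantity: the sum of absolute values of the 2D Hadamard transform of the pixel-difference block, with the final sum halved as in x265's C reference. A plain scalar sketch for the 4x4 case, assuming 8-bit pixels (names are illustrative; exactly where the halving happens inside the hadamard_4x4* helpers is not visible in this hunk):

    #include <stdint.h>
    #include <stdlib.h>

    static int satd_4x4_scalar(const uint8_t *pix1, intptr_t stride1,
                               const uint8_t *pix2, intptr_t stride2)
    {
        int d[4][4], t[4][4];

        // Difference block.
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = pix1[i * stride1 + j] - pix2[i * stride2 + j];

        // Horizontal 4-point Hadamard butterflies (output order does not
        // matter for a sum of absolute values).
        for (int i = 0; i < 4; i++)
        {
            int s0 = d[i][0] + d[i][1], s1 = d[i][2] + d[i][3];
            int d0 = d[i][0] - d[i][1], d1 = d[i][2] - d[i][3];
            t[i][0] = s0 + s1; t[i][1] = s0 - s1;
            t[i][2] = d0 + d1; t[i][3] = d0 - d1;
        }

        // Vertical pass and accumulation of absolute transform coefficients.
        int sum = 0;
        for (int j = 0; j < 4; j++)
        {
            int s0 = t[0][j] + t[1][j], s1 = t[2][j] + t[3][j];
            int d0 = t[0][j] - t[1][j], d1 = t[2][j] - t[3][j];
            sum += abs(s0 + s1) + abs(s0 - s1) + abs(d0 + d1) + abs(d0 - d1);
        }

        return sum >> 1;
    }

The larger block sizes simply tile this transform (or the 8x8 variant for sa8d) and sum the per-block results, which is what the satd4_neon/satd8_neon templates further down express.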
 
 template<int size>
 void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
@@ -1425,7 +1431,7 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
 
 template<int w, int h>
 // Calculate sa8d in blocks of 8x8
-int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d8_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
 {
     int cost = 0;
 
@@ -1440,7 +1446,7 @@ int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2
 
 template<int w, int h>
 // Calculate sa8d in blocks of 16x16
-int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d16_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
 {
     int cost = 0;
 
@@ -1474,42 +1480,63 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
 
 
 template<int w, int h>
-// calculate satd in blocks of 4x4
 int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
 {
     int satd = 0;
 
-    for (int row = 0; row < h; row += 4)
-        for (int col = 0; col < w; col += 4)
-            satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
-                                        pix2 + row * stride_pix2 + col, stride_pix2);
+    if (w == 4 && h == 4)
+    {
+        satd = pixel_satd_4x4_neon(pix1, stride_pix1, pix2, stride_pix2);
+    }
+    else
+    {
+        for (int row = 0; row < h; row += 8)
+            for (int col = 0; col < w; col += 4)
+                satd += pixel_satd_4x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                            pix2 + row * stride_pix2 + col, stride_pix2);
+    }
 
     return satd;
 }
 
 template<int w, int h>
-// calculate satd in blocks of 8x4
 int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
 {
     int satd = 0;
 
-    if (((w | h) & 15) == 0)
+    if (w % 16 == 0 && h % 16 == 0)
     {
         for (int row = 0; row < h; row += 16)
             for (int col = 0; col < w; col += 16)
                 satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
                                               pix2 + row * stride_pix2 + col, stride_pix2);
-
     }
-    else if (((w | h) & 7) == 0)
+    else if (w % 8 == 0 && h % 16 == 0)
+    {
+        for (int row = 0; row < h; row += 16)
+            for (int col = 0; col < w; col += 8)
+                satd += pixel_satd_8x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                             pix2 + row * stride_pix2 + col, stride_pix2);
+    }
+    else if (w % 16 == 0 && h % 8 == 0)
+    {
+        for (int row = 0; row < h; row += 8)
+            for (int col = 0; col < w; col += 16)
+                satd += pixel_satd_16x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                             pix2 + row * stride_pix2 + col, stride_pix2);
+    }
+    else if (w % 16 == 0 && h % 4 == 0)
+    {
+        for (int row = 0; row < h; row += 4)
+            for (int col = 0; col < w; col += 16)
+                satd += pixel_satd_16x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+                                             pix2 + row * stride_pix2 + col, stride_pix2);
+    }
+    else if (w % 8 == 0 && h % 8 == 0)
     {
         for (int row = 0; row < h; row += 8)
             for (int col = 0; col < w; col += 8)
                 satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
                                             pix2 + row * stride_pix2 + col, stride_pix2);
-
     }
-    else
+    else // w multiple of 8, h multiple of 4
     {
         for (int row = 0; row < h; row += 4)
             for (int col = 0; col < w; col += 8)
@@ -1634,38 +1661,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     LUMA_PU(64, 16);
     LUMA_PU(16, 64);
 
-    p.pu[LUMA_4x4].satd   = pixel_satd_4x4_neon;
-    p.pu[LUMA_8x4].satd   = pixel_satd_8x4_neon;
-    
+    p.pu[LUMA_4x4].satd   = satd4_neon<4, 4>;
+    p.pu[LUMA_4x8].satd   = satd4_neon<4, 8>;
+    p.pu[LUMA_4x16].satd  = satd4_neon<4, 16>;
+    p.pu[LUMA_8x4].satd   = satd8_neon<8, 4>;
     p.pu[LUMA_8x8].satd   = satd8_neon<8, 8>;
-    p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
-    p.pu[LUMA_16x8].satd  = satd8_neon<16, 8>;
     p.pu[LUMA_8x16].satd  = satd8_neon<8, 16>;
-    p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+    p.pu[LUMA_8x32].satd  = satd8_neon<8, 32>;
+    p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
     p.pu[LUMA_16x4].satd  = satd8_neon<16, 4>;
-    p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
-    p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+    p.pu[LUMA_16x8].satd  = satd8_neon<16, 8>;
+    p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+    p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
     p.pu[LUMA_16x32].satd = satd8_neon<16, 32>;
-    p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+    p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
     p.pu[LUMA_24x32].satd = satd8_neon<24, 32>;
     p.pu[LUMA_32x8].satd  = satd8_neon<32, 8>;
-    p.pu[LUMA_8x32].satd  = satd8_neon<8, 32>;
-    p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
-    p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+    p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+    p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+    p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
     p.pu[LUMA_32x64].satd = satd8_neon<32, 64>;
-    p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
     p.pu[LUMA_48x64].satd = satd8_neon<48, 64>;
     p.pu[LUMA_64x16].satd = satd8_neon<64, 16>;
-    p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
-
-#if HIGH_BIT_DEPTH
-    p.pu[LUMA_4x8].satd   = satd4_neon<4, 8>;
-    p.pu[LUMA_4x16].satd  = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
-    p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+    p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+    p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
+    p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
 
 
     LUMA_CU(4, 4);
@@ -1673,7 +1693,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     LUMA_CU(16, 16);
     LUMA_CU(32, 32);
     LUMA_CU(64, 64);
-    
+
 #if !(HIGH_BIT_DEPTH)
     p.cu[BLOCK_8x8].var   = pixel_var_neon<8>;
     p.cu[BLOCK_16x16].var = pixel_var_neon<16>;
@@ -1697,17 +1717,17 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_8x8].calcresidual[ALIGNED]       = getResidual_neon<8>;
     p.cu[BLOCK_16x16].calcresidual[NONALIGNED]  = getResidual_neon<16>;
     p.cu[BLOCK_16x16].calcresidual[ALIGNED]     = getResidual_neon<16>;
-    
+
 #if defined(__APPLE__)
     p.cu[BLOCK_32x32].calcresidual[NONALIGNED]  = getResidual_neon<32>;
     p.cu[BLOCK_32x32].calcresidual[ALIGNED]     = getResidual_neon<32>;
 #endif // defined(__APPLE__)
 
-    p.cu[BLOCK_4x4].sa8d   = pixel_satd_4x4_neon;
-    p.cu[BLOCK_8x8].sa8d   = pixel_sa8d_8x8_neon;
-    p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
-    p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
-    p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
+    p.cu[BLOCK_4x4].sa8d   = satd4_neon<4, 4>;
+    p.cu[BLOCK_8x8].sa8d   = sa8d8_neon<8, 8>;
+    p.cu[BLOCK_16x16].sa8d = sa8d16_neon<16, 16>;
+    p.cu[BLOCK_32x32].sa8d = sa8d16_neon<32, 32>;
+    p.cu[BLOCK_64x64].sa8d = sa8d16_neon<64, 64>;
 
 
 #define CHROMA_PU_420(W, H) \
@@ -1743,38 +1763,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 
     p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = pixel_satd_4x4_neon;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd8_neon<8, 8>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd   = NULL;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = pixel_satd_8x4_neon;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd8_neon<16, 8>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd8_neon<8, 16>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
-
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd   = NULL;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd   = NULL;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4_neon<16, 12>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd4_neon<16, 4>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd8_neon<32, 8>;
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd8_neon<8, 32>;
-    
-#if HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = satd4_neon<4, 4>;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = satd4_neon<4, 8>;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = satd8_neon<8, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd   = NULL;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd8_neon<8, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd8_neon<8, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd8_neon<8, 32>;
     p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd8_neon<16, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd8_neon<16, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd8_neon<16, 12>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd8_neon<32, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
 
 
 #define CHROMA_CU_420(W, H) \
@@ -1783,7 +1795,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>;  \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-    
+
 #define CHROMA_CU_S_420(W, H) \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
@@ -1799,9 +1811,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 
     p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
-    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
-    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
-    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8_neon<8, 8>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16_neon<16, 16>;
+    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16_neon<32, 32>;
 
 
 #define CHROMA_PU_422(W, H) \
@@ -1837,34 +1849,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 
     p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd   = NULL;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd8_neon<8, 16>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = pixel_satd_4x4_neon;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd   = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd  = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = satd4_neon<4, 4>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd4_neon<4, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd4_neon<4, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd4_neon<4, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd  = NULL;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd8_neon<8, 4>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = satd8_neon<8, 8>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd8_neon<8, 12>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd8_neon<8, 16>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = satd8_neon<8, 32>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd  = NULL;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd4_neon<8, 4>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd  = NULL;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd8_neon<16, 8>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
-    
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd4_neon<8, 12>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = satd8_neon<8, 64>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd8_neon<16, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8_neon<16, 24>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8_neon<24, 64>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
     p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8_neon<32, 48>;
-
-#if HIGH_BIT_DEPTH
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd4_neon<4, 8>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd4_neon<4, 16>;
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd4_neon<4, 32>;
-#endif // HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
 
 
 #define CHROMA_CU_422(W, H) \
@@ -1887,10 +1896,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     CHROMA_CU_422(16, 32)
     CHROMA_CU_422(32, 64)
 
-    p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d   = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
-    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
-    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
-    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d       = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
+    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d     = sa8d8_neon<8, 16>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d     = sa8d16_neon<16, 32>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d     = sa8d16_neon<32, 64>;
+
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d  = sa8d8_neon<8, 16>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = sa8d16_neon<16, 32>;
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = sa8d16_neon<32, 64>;
 
     p.weight_pp = weight_pp_neon;
 
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 106ba903a..856c12862 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -56,261 +56,3 @@ function PFX(pixel_sub_ps_8x16_sve)
     ret
 endfunc
 
-//******* satd *******
-.macro satd_4x4_sve
-    ld1b            {z0.h}, p0/z, [x0]
-    ld1b            {z2.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z1.h}, p0/z, [x0]
-    ld1b            {z3.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z4.h}, p0/z, [x0]
-    ld1b            {z6.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z5.h}, p0/z, [x0]
-    ld1b            {z7.h}, p0/z, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-
-    sub             z0.h, z0.h, z2.h
-    sub             z1.h, z1.h, z3.h
-    sub             z2.h, z4.h, z6.h
-    sub             z3.h, z5.h, z7.h
-
-    add             z4.h, z0.h, z2.h
-    add             z5.h, z1.h, z3.h
-    sub             z6.h, z0.h, z2.h
-    sub             z7.h, z1.h, z3.h
-
-    add             z0.h, z4.h, z5.h
-    sub             z1.h, z4.h, z5.h
-
-    add             z2.h, z6.h, z7.h
-    sub             z3.h, z6.h, z7.h
-
-    trn1            z4.h, z0.h, z2.h
-    trn2            z5.h, z0.h, z2.h
-
-    trn1            z6.h, z1.h, z3.h
-    trn2            z7.h, z1.h, z3.h
-
-    add             z0.h, z4.h, z5.h
-    sub             z1.h, z4.h, z5.h
-
-    add             z2.h, z6.h, z7.h
-    sub             z3.h, z6.h, z7.h
-
-    trn1            z4.s, z0.s, z1.s
-    trn2            z5.s, z0.s, z1.s
-
-    trn1            z6.s, z2.s, z3.s
-    trn2            z7.s, z2.s, z3.s
-
-    abs             z4.h, p0/m, z4.h
-    abs             z5.h, p0/m, z5.h
-    abs             z6.h, p0/m, z6.h
-    abs             z7.h, p0/m, z7.h
-
-    smax            z4.h, p0/m, z4.h, z5.h
-    smax            z6.h, p0/m, z6.h, z7.h
-
-    add             z0.h, z4.h, z6.h
-
-    uaddlp          v0.2s, v0.4h
-    uaddlp          v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_sve)
-    ptrue           p0.h, vl4
-    satd_4x4_sve
-    fmov            x0, d0
-    ret
-endfunc
-
-function PFX(pixel_satd_8x4_sve)
-    ptrue           p0.h, vl4
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_sve
-    add             x0, x4, #4
-    add             x2, x5, #4
-    umov            x6, v0.d[0]
-    satd_4x4_sve
-    umov            x0, v0.d[0]
-    add             x0, x0, x6
-    ret
-endfunc
-
-function PFX(pixel_satd_8x12_sve)
-    ptrue           p0.h, vl4
-    mov             x4, x0
-    mov             x5, x2
-    mov             x7, #0
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.rept 2
-    sub             x0, x0, #4
-    sub             x2, x2, #4
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_sve
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.endr
-    mov             x0, x7
-    ret
-endfunc
-
-.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
-    mov             x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
-    ld1b            {z0.h}, p0/z, [x0]
-    ld1b            {z1.h}, p0/z, [x0, x11]
-    ld1b            {z2.h}, p0/z, [x2]
-    ld1b            {z3.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z4.h}, p0/z, [x0]
-    ld1b            {z5.h}, p0/z, [x0, x11]
-    ld1b            {z6.h}, p0/z, [x2]
-    ld1b            {z7.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    sub             \v0\().h, z0.h, z2.h
-    sub             \v4\().h, z1.h, z3.h
-    sub             \v1\().h, z4.h, z6.h
-    sub             \v5\().h, z5.h, z7.h
-
-    ld1b            {z0.h}, p0/z, [x0]
-    ld1b            {z1.h}, p0/z, [x0, x11]
-    ld1b            {z2.h}, p0/z, [x2]
-    ld1b            {z3.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z4.h}, p0/z, [x0]
-    ld1b            {z5.h}, p0/z, [x0, x11]
-    ld1b            {z6.h}, p0/z, [x2]
-    ld1b            {z7.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    sub             \v2\().h, z0.h, z2.h
-    sub             \v6\().h, z1.h, z3.h
-    sub             \v3\().h, z4.h, z6.h
-    sub             \v7\().h, z5.h, z7.h
-.endm
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_sve), export=0
-    HADAMARD4_V     z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
-    HADAMARD4_V     z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
-    trn4            z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
-    trn4            z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
-    SUMSUB_ABCD     z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
-    SUMSUB_ABCD     z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
-    trn4            z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
-    trn4            z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
-    ABS8_SVE        z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
-    smax            z0.h, p0/m, z0.h, z2.h
-    smax            z1.h, p0/m, z1.h, z3.h
-    smax            z4.h, p0/m, z4.h, z6.h
-    smax            z5.h, p0/m, z5.h, z7.h
-    ret
-endfunc
-
-function PFX(satd_16x4_sve), export=0
-    LOAD_DIFF_16x4_sve  z16, z17, z18, z19, z20, z21, z22, z23
-    b                    PFX(satd_8x4v_8x8h_sve)
-endfunc
-
-.macro pixel_satd_32x8_sve
-    mov             x4, x0
-    mov             x5, x2
-.rept 2
-    bl              PFX(satd_16x4_sve)
-    add             z30.h, z30.h, z0.h
-    add             z31.h, z31.h, z1.h
-    add             z30.h, z30.h, z4.h
-    add             z31.h, z31.h, z5.h
-.endr
-    add             x0, x4, #16
-    add             x2, x5, #16
-.rept 2
-    bl              PFX(satd_16x4_sve)
-    add             z30.h, z30.h, z0.h
-    add             z31.h, z31.h, z1.h
-    add             z30.h, z30.h, z4.h
-    add             z31.h, z31.h, z5.h
-.endr
-.endm
-
-.macro satd_32x16_sve
-    movi            v30.2d, #0
-    movi            v31.2d, #0
-    pixel_satd_32x8_sve
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    pixel_satd_32x8_sve
-    add             z0.h, z30.h, z31.h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-.endm
-
-function PFX(pixel_satd_32x16_sve)
-    ptrue           p0.h, vl8
-    mov             x10, x30
-    satd_32x16_sve
-    mov             x0, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x32_sve)
-    ptrue           p0.h, vl8
-    mov             x10, x30
-    mov             x7, #0
-    satd_32x16_sve
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-    satd_32x16_sve
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-.macro satd_64x16_sve
-    mov             x8, x0
-    mov             x9, x2
-    satd_32x16_sve
-    add             x7, x7, x6
-    add             x0, x8, #32
-    add             x2, x9, #32
-    satd_32x16_sve
-    add             x7, x7, x6
-.endm
-
-function PFX(pixel_satd_64x48_sve)
-    ptrue           p0.h, vl8
-    mov             x10, x30
-    mov             x7, #0
-.rept 2
-    satd_64x16_sve
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-.endr
-    satd_64x16_sve
-    mov             x0, x7
-    ret             x10
-endfunc
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 26fdbac6c..e189fdcd7 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -565,963 +565,6 @@ function PFX(scale2D_64to32_neon)
     ret
 endfunc
 
-//******* satd *******
-.macro satd_4x4_neon
-    ldr             s0, [x0]
-    ldr             s1, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1             {v0.s}[1], [x0], x1
-    ld1             {v1.s}[1], [x2], x3
-
-    ldr             s2, [x0]
-    ldr             s3, [x2]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1             {v2.s}[1], [x0], x1
-    ld1             {v3.s}[1], [x2], x3
-
-    usubl           v4.8h, v0.8b, v1.8b
-    usubl           v5.8h, v2.8b, v3.8b
-
-    add             v6.8h, v4.8h, v5.8h
-    sub             v7.8h, v4.8h, v5.8h
-
-    mov             v4.d[0], v6.d[1]
-    add             v0.4h, v6.4h, v4.4h
-    sub             v2.4h, v6.4h, v4.4h
-
-    mov             v5.d[0], v7.d[1]
-    add             v1.4h, v7.4h, v5.4h
-    sub             v3.4h, v7.4h, v5.4h
-
-    trn1            v4.4h, v0.4h, v1.4h
-    trn2            v5.4h, v0.4h, v1.4h
-
-    trn1            v6.4h, v2.4h, v3.4h
-    trn2            v7.4h, v2.4h, v3.4h
-
-    add             v0.4h, v4.4h, v5.4h
-    sub             v1.4h, v4.4h, v5.4h
-
-    add             v2.4h, v6.4h, v7.4h
-    sub             v3.4h, v6.4h, v7.4h
-
-    trn1            v4.2s, v0.2s, v1.2s
-    trn2            v5.2s, v0.2s, v1.2s
-
-    trn1            v6.2s, v2.2s, v3.2s
-    trn2            v7.2s, v2.2s, v3.2s
-
-    abs             v4.4h, v4.4h
-    abs             v5.4h, v5.4h
-    abs             v6.4h, v6.4h
-    abs             v7.4h, v7.4h
-
-    smax            v1.4h, v4.4h, v5.4h
-    smax            v2.4h, v6.4h, v7.4h
-
-    add             v0.4h, v1.4h, v2.4h
-    uaddlp          v0.2s, v0.4h
-    uaddlp          v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_neon)
-    satd_4x4_neon
-    fmov            x0, d0
-    ret
-endfunc
-
-.macro x265_satd_4x8_8x4_end_neon
-    add             v0.8h, v4.8h, v6.8h
-    add             v1.8h, v5.8h, v7.8h
-    sub             v2.8h, v4.8h, v6.8h
-    sub             v3.8h, v5.8h, v7.8h
-
-    trn1            v16.8h, v0.8h, v1.8h
-    trn2            v17.8h, v0.8h, v1.8h
-    add             v4.8h, v16.8h, v17.8h
-    trn1            v18.8h, v2.8h, v3.8h
-    trn2            v19.8h, v2.8h, v3.8h
-    sub             v5.8h, v16.8h, v17.8h
-    add             v6.8h, v18.8h, v19.8h
-    sub             v7.8h, v18.8h, v19.8h
-    trn1            v0.4s, v4.4s, v6.4s
-    trn2            v2.4s, v4.4s, v6.4s
-    abs             v0.8h, v0.8h
-    trn1            v1.4s, v5.4s, v7.4s
-    trn2            v3.4s, v5.4s, v7.4s
-    abs             v2.8h, v2.8h
-    abs             v1.8h, v1.8h
-    abs             v3.8h, v3.8h
-    umax            v0.8h, v0.8h, v2.8h
-    umax            v1.8h, v1.8h, v3.8h
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-.endm
-
-.macro pixel_satd_4x8_neon
-    ld1r            {v1.2s}, [x2], x3
-    ld1r            {v0.2s}, [x0], x1
-    ld1r            {v3.2s}, [x2], x3
-    ld1r            {v2.2s}, [x0], x1
-    ld1r            {v5.2s}, [x2], x3
-    ld1r            {v4.2s}, [x0], x1
-    ld1r            {v7.2s}, [x2], x3
-    ld1r            {v6.2s}, [x0], x1
-
-    ld1             {v1.s}[1], [x2], x3
-    ld1             {v0.s}[1], [x0], x1
-    usubl           v0.8h, v0.8b, v1.8b
-    ld1             {v3.s}[1], [x2], x3
-    ld1             {v2.s}[1], [x0], x1
-    usubl           v1.8h, v2.8b, v3.8b
-    ld1             {v5.s}[1], [x2], x3
-    ld1             {v4.s}[1], [x0], x1
-    usubl           v2.8h, v4.8b, v5.8b
-    ld1             {v7.s}[1], [x2], x3
-    add             v4.8h, v0.8h, v1.8h
-    sub             v5.8h, v0.8h, v1.8h
-    ld1             {v6.s}[1], [x0], x1
-    usubl           v3.8h, v6.8b, v7.8b
-    add             v6.8h, v2.8h, v3.8h
-    sub             v7.8h, v2.8h, v3.8h
-    x265_satd_4x8_8x4_end_neon
-.endm
-
-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x8_neon)
-    pixel_satd_4x8_neon
-    mov             w0, v0.s[0]
-    ret
-endfunc
-
-function PFX(pixel_satd_4x16_neon)
-    mov             w4, #0
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w4, w4, w5
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w0, w5, w4
-    ret
-endfunc
-
-function PFX(pixel_satd_4x32_neon)
-    mov             w4, #0
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w5, v0.s[0]
-    add             w4, w4, w5
-.endr
-    mov             w0, w4
-    ret
-endfunc
-
-function PFX(pixel_satd_12x16_neon)
-    mov             x4, x0
-    mov             x5, x2
-    mov             w7, #0
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-
-    add             x0, x4, #4
-    add             x2, x5, #4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-
-    add             x0, x4, #8
-    add             x2, x5, #8
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w0, w7, w6
-    ret
-endfunc
-
-function PFX(pixel_satd_12x32_neon)
-    mov             x4, x0
-    mov             x5, x2
-    mov             w7, #0
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-.endr
-
-    add             x0, x4, #4
-    add             x2, x5, #4
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-.endr
-
-    add             x0, x4, #8
-    add             x2, x5, #8
-.rept 4
-    pixel_satd_4x8_neon
-    mov             w6, v0.s[0]
-    add             w7, w7, w6
-.endr
-
-    mov             w0, w7
-    ret
-endfunc
-
-function PFX(pixel_satd_8x4_neon)
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_neon
-    add             x0, x4, #4
-    add             x2, x5, #4
-    umov            x6, v0.d[0]
-    satd_4x4_neon
-    umov            x0, v0.d[0]
-    add             x0, x0, x6
-    ret
-endfunc
-
-.macro LOAD_DIFF_8x4 v0 v1 v2 v3
-    ld1             {v0.8b}, [x0], x1
-    ld1             {v1.8b}, [x2], x3
-    ld1             {v2.8b}, [x0], x1
-    ld1             {v3.8b}, [x2], x3
-    ld1             {v4.8b}, [x0], x1
-    ld1             {v5.8b}, [x2], x3
-    ld1             {v6.8b}, [x0], x1
-    ld1             {v7.8b}, [x2], x3
-    usubl           \v0, v0.8b, v1.8b
-    usubl           \v1, v2.8b, v3.8b
-    usubl           \v2, v4.8b, v5.8b
-    usubl           \v3, v6.8b, v7.8b
-.endm
-
-.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
-    ld1             {v0.16b}, [x0], x1
-    ld1             {v1.16b}, [x2], x3
-    ld1             {v2.16b}, [x0], x1
-    ld1             {v3.16b}, [x2], x3
-    ld1             {v4.16b}, [x0], x1
-    ld1             {v5.16b}, [x2], x3
-    ld1             {v6.16b}, [x0], x1
-    ld1             {v7.16b}, [x2], x3
-    usubl           \v0, v0.8b, v1.8b
-    usubl           \v1, v2.8b, v3.8b
-    usubl           \v2, v4.8b, v5.8b
-    usubl           \v3, v6.8b, v7.8b
-    usubl2          \v4, v0.16b, v1.16b
-    usubl2          \v5, v2.16b, v3.16b
-    usubl2          \v6, v4.16b, v5.16b
-    usubl2          \v7, v6.16b, v7.16b
-.endm
-
-function PFX(satd_16x4_neon), export=0
-    LOAD_DIFF_16x4  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    b               PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-function PFX(satd_8x8_neon), export=0
-    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
-    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
-    b               PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_neon), export=0
-    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
-    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    trn4            v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
-    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    SUMSUB_ABCD     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    SUMSUB_ABCD     v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
-    trn4            v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
-    trn4            v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
-    ABS8            v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
-    smax            v0.8h, v0.8h, v2.8h
-    smax            v1.8h, v1.8h, v3.8h
-    smax            v2.8h, v4.8h, v6.8h
-    smax            v3.8h, v5.8h, v7.8h
-    ret
-endfunc
-
-function PFX(pixel_satd_8x8_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    add             v1.8h, v2.8h, v3.8h
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_8x12_neon)
-    mov             x4, x0
-    mov             x5, x2
-    mov             x7, #0
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.rept 2
-    sub             x0, x0, #4
-    sub             x2, x2, #4
-    mov             x4, x0
-    mov             x5, x2
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-    add             x0, x4, #4
-    add             x2, x5, #4
-    satd_4x4_neon
-    umov            x6, v0.d[0]
-    add             x7, x7, x6
-.endr
-    mov             x0, x7
-    ret
-endfunc
-
-function PFX(pixel_satd_8x16_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_8x32_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 3
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_8x64_neon)
-    mov             x10, x30
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 7
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x4_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x8_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x12_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 2
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x16_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 3
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x24_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 5
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-.macro pixel_satd_16x32_neon
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 7
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-function PFX(pixel_satd_16x32_neon)
-    mov             x10, x30
-    pixel_satd_16x32_neon
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_16x64_neon)
-    mov             x10, x30
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v0.8h, v1.8h
-    add             v31.8h, v2.8h, v3.8h
-.rept 15
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_24x32_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x4, x0
-    mov             x5, x2
-.rept 3
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-.rept 4
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-    add             x4, x4, #8
-    add             x5, x5, #8
-    mov             x0, x4
-    mov             x2, x5
-.endr
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_24x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x4, x0
-    mov             x5, x2
-.rept 3
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-.rept 4
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-    add             x4, x4, #8
-    add             x5, x5, #8
-    mov             x0, x4
-    mov             x2, x5
-.endr
-    sub             x4, x4, #24
-    sub             x5, x5, #24
-    add             x0, x4, x1, lsl #5
-    add             x2, x5, x3, lsl #5
-    mov             x4, x0
-    mov             x5, x2
-.rept 3
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-.rept 4
-    bl              PFX(satd_8x8_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-    add             x4, x4, #8
-    add             x5, x5, #8
-    mov             x0, x4
-    mov             x2, x5
-.endr
-    mov             x0, x7
-    ret             x10
-endfunc
-
-.macro pixel_satd_32x8
-    mov             x4, x0
-    mov             x5, x2
-.rept 2
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-    add             x0, x4, #16
-    add             x2, x5, #16
-.rept 2
-    bl              PFX(satd_16x4_neon)
-    add             v30.8h, v30.8h, v0.8h
-    add             v31.8h, v31.8h, v1.8h
-    add             v30.8h, v30.8h, v2.8h
-    add             v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-.macro satd_32x16_neon
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    pixel_satd_32x8
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    pixel_satd_32x8
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-.endm
-
-.macro satd_64x16_neon
-    mov             x8, x0
-    mov             x9, x2
-    satd_32x16_neon
-    add             x7, x7, x6
-    add             x0, x8, #32
-    add             x2, x9, #32
-    satd_32x16_neon
-    add             x7, x7, x6
-.endm
-
-function PFX(pixel_satd_32x8_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x4, x0
-    mov             x5, x2
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    pixel_satd_32x8
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x16_neon)
-    mov             x10, x30
-    satd_32x16_neon
-    mov             x0, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x24_neon)
-    mov             x10, x30
-    satd_32x16_neon
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    pixel_satd_32x8
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    add             x0, x0, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x32_neon)
-    mov             x10, x30
-    mov             x7, #0
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-    satd_32x16_neon
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x48_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 2
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-.endr
-    satd_32x16_neon
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_32x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 3
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-.endr
-    satd_32x16_neon
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x16_neon)
-    mov             x10, x30
-    mov             x7, #0
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x32_neon)
-    mov             x10, x30
-    mov             x7, #0
-    satd_64x16_neon
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x48_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 2
-    satd_64x16_neon
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-.endr
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_64x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-.rept 3
-    satd_64x16_neon
-    sub             x0, x0, #48
-    sub             x2, x2, #48
-.endr
-    satd_64x16_neon
-    mov             x0, x7
-    ret             x10
-endfunc
-
-function PFX(pixel_satd_48x64_neon)
-    mov             x10, x30
-    mov             x7, #0
-    mov             x8, x0
-    mov             x9, x2
-.rept 3
-    satd_32x16_neon
-    sub             x0, x0, #16
-    sub             x2, x2, #16
-    add             x7, x7, x6
-.endr
-    satd_32x16_neon
-    add             x7, x7, x6
-
-    add             x0, x8, #32
-    add             x2, x9, #32
-    pixel_satd_16x32_neon
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x7, x7, x6
-
-    movi            v30.8h, #0
-    movi            v31.8h, #0
-    pixel_satd_16x32_neon
-    add             v0.8h, v30.8h, v31.8h
-    uaddlv          s0, v0.8h
-    mov             w6, v0.s[0]
-    add             x0, x7, x6
-    ret             x10
-endfunc
-
-function PFX(sa8d_8x8_neon), export=0
-    LOAD_DIFF_8x4   v16.8h, v17.8h, v18.8h, v19.8h
-    LOAD_DIFF_8x4   v20.8h, v21.8h, v22.8h, v23.8h
-    HADAMARD4_V     v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
-    HADAMARD4_V     v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    SUMSUB_ABCD     v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
-    SUMSUB_ABCD     v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
-    trn4            v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
-    trn4            v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
-    SUMSUB_ABCD     v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
-    SUMSUB_ABCD     v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
-    trn4            v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
-    trn4            v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
-    SUMSUB_ABCD     v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
-    SUMSUB_ABCD     v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
-    trn4            v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
-    trn4            v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
-    ABS8            v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
-    smax            v16.8h, v16.8h, v20.8h
-    smax            v17.8h, v17.8h, v21.8h
-    smax            v18.8h, v18.8h, v22.8h
-    smax            v19.8h, v19.8h, v23.8h
-    add             v0.8h, v16.8h, v17.8h
-    add             v1.8h, v18.8h, v19.8h
-    ret
-endfunc
-
-function PFX(pixel_sa8d_8x8_neon)
-    mov             x10, x30
-    bl              PFX(sa8d_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w0, v0.s[0]
-    add             w0, w0, #1
-    lsr             w0, w0, #1
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_8x16_neon)
-    mov             x10, x30
-    bl              PFX(sa8d_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w5, v0.s[0]
-    add             w5, w5, #1
-    lsr             w5, w5, #1
-    bl              PFX(sa8d_8x8_neon)
-    add             v0.8h, v0.8h, v1.8h
-    uaddlv          s0, v0.8h
-    mov             w4, v0.s[0]
-    add             w4, w4, #1
-    lsr             w4, w4, #1
-    add             w0, w4, w5
-    ret             x10
-endfunc
-
-.macro sa8d_16x16 reg
-    bl              PFX(sa8d_8x8_neon)
-    uaddlp          v30.4s, v0.8h
-    uaddlp          v31.4s, v1.8h
-    bl              PFX(sa8d_8x8_neon)
-    uadalp          v30.4s, v0.8h
-    uadalp          v31.4s, v1.8h
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    bl              PFX(sa8d_8x8_neon)
-    uadalp          v30.4s, v0.8h
-    uadalp          v31.4s, v1.8h
-    bl              PFX(sa8d_8x8_neon)
-    uadalp          v30.4s, v0.8h
-    uadalp          v31.4s, v1.8h
-    add             v0.4s, v30.4s, v31.4s
-    addv            s0, v0.4s
-    mov             \reg, v0.s[0]
-    add             \reg, \reg, #1
-    lsr             \reg, \reg, #1
-.endm
-
-function PFX(pixel_sa8d_16x16_neon)
-    mov             x10, x30
-    sa8d_16x16      w0
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_16x32_neon)
-    mov             x10, x30
-    sa8d_16x16      w4
-    sub             x0, x0, #8
-    sub             x2, x2, #8
-    sa8d_16x16      w5
-    add             w0, w4, w5
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_32x32_neon)
-    mov             x10, x30
-    sa8d_16x16      w4
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w5
-    sub             x0, x0, #24
-    sub             x2, x2, #24
-    sa8d_16x16      w6
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w7
-    add             w4, w4, w5
-    add             w6, w6, w7
-    add             w0, w4, w6
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_32x64_neon)
-    mov             x10, x30
-    mov             w11, #4
-    mov             w9, #0
-.Loop_sa8d_32:
-    sub             w11, w11, #1
-    sa8d_16x16      w4
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w5
-    add             w4, w4, w5
-    add             w9, w9, w4
-    sub             x0, x0, #24
-    sub             x2, x2, #24
-    cbnz            w11, .Loop_sa8d_32
-    mov             w0, w9
-    ret             x10
-endfunc
-
-function PFX(pixel_sa8d_64x64_neon)
-    mov             x10, x30
-    mov             w11, #4
-    mov             w9, #0
-.Loop_sa8d_64:
-    sub             w11, w11, #1
-    sa8d_16x16      w4
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w5
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w6
-    sub             x0, x0, x1, lsl #4
-    sub             x2, x2, x3, lsl #4
-    add             x0, x0, #8
-    add             x2, x2, #8
-    sa8d_16x16      w7
-    add             w4, w4, w5
-    add             w6, w6, w7
-    add             w8, w4, w6
-    add             w9, w9, w8
-
-    sub             x0, x0, #56
-    sub             x2, x2, #56
-    cbnz            w11, .Loop_sa8d_64
-    mov             w0, w9
-    ret             x10
-endfunc
-
 /***** dequant_scaling*****/
 // void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
 function PFX(dequant_scaling_neon)
-- 
2.39.5 (Apple Git-154)


