[x265] [PATCH 1/3] AArch64: Clean up satd/sa8d functions
Li Zhang
li.zhang2 at arm.com
Wed Apr 30 18:17:36 UTC 2025
Clean up and optimize the Neon intrinsics implementation of the
satd/sa8d primitives for all bit depths.
Remove the Neon and SVE assembly implementations of these primitives
since they are now slower than the Neon intrinsics implementations.
---
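Note (illustrative only, not part of this patch): a minimal scalar sketch of the
4x4 SATD that the Neon kernels below implement — take the 4x4 block of pixel
differences, apply the 2-D Hadamard transform via row and column butterflies,
and sum the absolute transformed values with the usual x264-style halving,
which the Neon code realises through the max-of-absolutes trick in the final
butterfly stage. The helper name and the pixel typedef here are assumptions
made for this sketch.

    #include <stdint.h>   /* intptr_t, uint8_t */
    #include <stdlib.h>   /* abs() */

    typedef uint8_t pixel;  /* uint16_t when HIGH_BIT_DEPTH; assumption for this sketch */

    static int satd_4x4_scalar_ref(const pixel *pix1, intptr_t stride1,
                                   const pixel *pix2, intptr_t stride2)
    {
        int d[4][4], t[4][4];

        /* Pixel differences. */
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = pix1[i * stride1 + j] - pix2[i * stride2 + j];

        /* Horizontal 4-point Hadamard butterflies. */
        for (int i = 0; i < 4; i++)
        {
            int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
            t[i][0] = s01 + s23; t[i][1] = s01 - s23;
            t[i][2] = d01 + d23; t[i][3] = d01 - d23;
        }

        /* Vertical butterflies, then sum of absolute transformed values. */
        int sum = 0;
        for (int j = 0; j < 4; j++)
        {
            int s01 = t[0][j] + t[1][j], d01 = t[0][j] - t[1][j];
            int s23 = t[2][j] + t[3][j], d23 = t[2][j] - t[3][j];
            sum += abs(s01 + s23) + abs(s01 - s23) + abs(d01 + d23) + abs(d01 - d23);
        }
        return sum >> 1;
    }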
source/common/aarch64/asm-primitives.cpp | 76 --
source/common/aarch64/mem-neon.h | 6 +-
source/common/aarch64/pixel-prim.cpp | 1407 +++++++++++-----------
source/common/aarch64/pixel-util-sve.S | 258 ----
source/common/aarch64/pixel-util.S | 957 ---------------
5 files changed, 712 insertions(+), 1992 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 6097f7655..4d2c575d1 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -652,64 +652,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
- // satd
- ALL_LUMA_PU(satd, pixel_satd, neon);
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_neon);
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_neon);
-
- // sa8d
- p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_neon);
- p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_neon);
- p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-
// dequant_scaling
p.dequant_scaling = PFX(dequant_scaling_neon);
@@ -857,24 +799,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
#if !HIGH_BIT_DEPTH
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
-
- // satd
- p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sve);
- p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sve);
- p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sve);
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_sve);
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_sve);
-
- // sa8d
- p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
#else // HIGH_BIT_DEPTH
// sse_pp
p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 263c1d569..8bd5fbee9 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -106,8 +106,7 @@ static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
{
for (int i = 0; i < N; ++i)
{
- dst[i] = vld1q_u8(src);
- src += stride;
+ dst[i] = vld1q_u8(src + i * stride);
}
}
@@ -230,8 +229,7 @@ static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
{
for (int i = 0; i < N; ++i)
{
- dst[i] = vld1q_u16(src);
- src += stride;
+ dst[i] = vld1q_u16(src + i * stride);
}
}
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 15ccdff22..67c388b59 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -19,799 +19,805 @@ namespace
{
-/* SATD SA8D variants - based on x264 */
-static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
+static inline void sumsubq_s16(int16x8_t *sum, int16x8_t *sub, const int16x8_t a, const int16x8_t b)
{
- sum = vaddq_s16(a, b);
- sub = vsubq_s16(a, b);
+ *sum = vaddq_s16(a, b);
+ *sub = vsubq_s16(a, b);
}
-static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
- const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s16x2(int16x8_t *t1, int16x8_t *t2,
+ const int16x8_t s1, const int16x8_t s2)
{
- t1 = vtrn1q_s16(s1, s2);
- t2 = vtrn2q_s16(s1, s2);
+ *t1 = vtrn1q_s16(s1, s2);
+ *t2 = vtrn2q_s16(s1, s2);
}
-static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
- const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s32x2(int16x8_t *t1, int16x8_t *t2,
+ const int16x8_t s1, const int16x8_t s2)
{
int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
- t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
- t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
+ *t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
+ *t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
}
-static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
- const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s64x2(int16x8_t *t1, int16x8_t *t2,
+ const int16x8_t s1, const int16x8_t s2)
{
int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
- t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
- t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
+ *t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
+ *t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
}
-static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
- int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d)
+static inline uint16x8_t max_abs_s16(const int16x8_t a, const int16x8_t b)
{
- SUMSUB_AB(s1, d1, a, b);
- SUMSUB_AB(s2, d2, c, d);
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(a));
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(b));
+
+ return vmaxq_u16(abs0, abs1);
}
-static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
- int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
+#if X265_DEPTH == 12
+static inline void sumsubq_s32(int32x4_t *sum, int32x4_t *sub, const int32x4_t a, const int32x4_t b)
{
- SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
- SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
+ *sum = vaddq_s32(a, b);
+ *sub = vsubq_s32(a, b);
}
-
-static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
-
+static inline void sumsublq_s16(int32x4_t *sum_lo, int32x4_t *sum_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi,
+ const int16x8_t a, const int16x8_t b)
{
+ *sum_lo = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+ *sub_lo = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+ *sum_hi = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+ *sub_hi = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+}
- int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
-
-
- SUMSUB_AB(v16, v17, v0, v1);
- SUMSUB_AB(v18, v19, v2, v3);
-
- SUMSUB_AB(v4 , v6 , v16, v18);
- SUMSUB_AB(v5 , v7 , v17, v19);
-
- transpose_8h_8h(v0, v1, v4, v5);
- transpose_8h_8h(v2, v3, v6, v7);
+static inline void transpose_inplace_s32_s64x2(int32x4_t *t1, int32x4_t *t2)
+{
+ int64x2_t tmp1 = vreinterpretq_s64_s32(*t1);
+ int64x2_t tmp2 = vreinterpretq_s64_s32(*t2);
- SUMSUB_AB(v16, v17, v0, v1);
- SUMSUB_AB(v18, v19, v2, v3);
+ *t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
+ *t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+}
- transpose_4s_8h(v0, v1, v16, v18);
- transpose_4s_8h(v2, v3, v17, v19);
+static inline uint32x4_t max_abs_s32(int32x4_t a, int32x4_t b)
+{
+ uint32x4_t abs0 = vreinterpretq_u32_s32(vabsq_s32(a));
+ uint32x4_t abs1 = vreinterpretq_u32_s32(vabsq_s32(b));
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
- uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
- uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+ return vmaxq_u32(abs0, abs1);
+}
- uint16x8_t max0 = vmaxq_u16(abs0, abs1);
- uint16x8_t max1 = vmaxq_u16(abs2, abs3);
+#endif // X265_DEPTH == 12
- uint16x8_t sum = vaddq_u16(max0, max1);
- return vaddlvq_u16(sum);
+#if HIGH_BIT_DEPTH
+static inline void load_diff_u16x8x4(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+ uint16x8_t r[4], t[4];
+ load_u16x8xn<4>(pix1, stride_pix1, r);
+ load_u16x8xn<4>(pix2, stride_pix2, t);
+
+ diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+ diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+ diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+ diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
}
-static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
+static inline void load_diff_u16x8x4_dual(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
{
- int16x8_t v2, v3;
- SUMSUB_AB(v2, v3, v0, v1);
-
- transpose_2d_8h(v0, v1, v2, v3);
- SUMSUB_AB(v2, v3, v0, v1);
-
- transpose_8h_8h(v0, v1, v2, v3);
- SUMSUB_AB(v2, v3, v0, v1);
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+ pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
+}
- transpose_4s_8h(v0, v1, v2, v3);
+static inline void load_diff_u16x8x8(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+ uint16x8_t r[8], t[8];
+ load_u16x8xn<8>(pix1, stride_pix1, r);
+ load_u16x8xn<8>(pix2, stride_pix2, t);
+
+ diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+ diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+ diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+ diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
+ diff[4] = vreinterpretq_s16_u16(vsubq_u16(r[4], t[4]));
+ diff[5] = vreinterpretq_s16_u16(vsubq_u16(r[5], t[5]));
+ diff[6] = vreinterpretq_s16_u16(vsubq_u16(r[6], t[6]));
+ diff[7] = vreinterpretq_s16_u16(vsubq_u16(r[7], t[7]));
+}
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
- uint16x8_t max = vmaxq_u16(abs0, abs1);
+#else // !HIGH_BIT_DEPTH
+static inline void load_diff_u8x8x4(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+ uint8x8_t r[4], t[4];
+ load_u8x8xn<4>(pix1, stride_pix1, r);
+ load_u8x8xn<4>(pix2, stride_pix2, t);
+
+ diff[0] = vreinterpretq_s16_u16(vsubl_u8(r[0], t[0]));
+ diff[1] = vreinterpretq_s16_u16(vsubl_u8(r[1], t[1]));
+ diff[2] = vreinterpretq_s16_u16(vsubl_u8(r[2], t[2]));
+ diff[3] = vreinterpretq_s16_u16(vsubl_u8(r[3], t[3]));
+}
- return vaddlvq_u16(max);
+static inline void load_diff_u8x8x8(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+ load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+ pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
}
-static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
- int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline void load_diff_u8x16x4(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
{
- int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
+ uint8x16_t s1[4], s2[4];
+ load_u8x16xn<4>(pix1, stride_pix1, s1);
+ load_u8x16xn<4>(pix2, stride_pix2, s2);
+
+ diff[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[0]), vget_low_u8(s2[0])));
+ diff[1] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[1]), vget_low_u8(s2[1])));
+ diff[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[2]), vget_low_u8(s2[2])));
+ diff[3] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[3]), vget_low_u8(s2[3])));
+ diff[4] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[0]), vget_high_u8(s2[0])));
+ diff[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[1]), vget_high_u8(s2[1])));
+ diff[6] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[2]), vget_high_u8(s2[2])));
+ diff[7] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[3]), vget_high_u8(s2[3])));
+}
- SUMSUB_AB(v16, v18, v0, v2);
- SUMSUB_AB(v17, v19, v1, v3);
+#endif // HIGH_BIT_DEPTH
- HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
+// 4-way Hadamard vertical pass.
+static inline void hadamard_4_v(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
+{
+ int16x8_t s0, s1, d0, d1;
- transpose_8h_8h(v0, v1, v16, v17);
- transpose_8h_8h(v2, v3, v18, v19);
- transpose_8h_8h(v4, v5, v20, v21);
- transpose_8h_8h(v6, v7, v22, v23);
+ sumsubq_s16(&s0, &d0, in_coefs[0], in_coefs[1]);
+ sumsubq_s16(&s1, &d1, in_coefs[2], in_coefs[3]);
- SUMSUB_AB(v16, v17, v0, v1);
- SUMSUB_AB(v18, v19, v2, v3);
- SUMSUB_AB(v20, v21, v4, v5);
- SUMSUB_AB(v22, v23, v6, v7);
+ sumsubq_s16(&out_coefs[0], &out_coefs[2], s0, s1);
+ sumsubq_s16(&out_coefs[1], &out_coefs[3], d0, d1);
+}
- transpose_4s_8h(v0, v2, v16, v18);
- transpose_4s_8h(v1, v3, v17, v19);
- transpose_4s_8h(v4, v6, v20, v22);
- transpose_4s_8h(v5, v7, v21, v23);
+// 8-way Hadamard vertical pass.
+static inline void hadamard_8_v(const int16x8_t in_coefs[8], int16x8_t out_coefs[8])
+{
+ int16x8_t temp[8];
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
- uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
- uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
- uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
- uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
- uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
- uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+ hadamard_4_v(in_coefs, temp);
+ hadamard_4_v(in_coefs + 4, temp + 4);
- v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
- v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
- v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
- v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
+ sumsubq_s16(&out_coefs[0], &out_coefs[4], temp[0], temp[4]);
+ sumsubq_s16(&out_coefs[1], &out_coefs[5], temp[1], temp[5]);
+ sumsubq_s16(&out_coefs[2], &out_coefs[6], temp[2], temp[6]);
+ sumsubq_s16(&out_coefs[3], &out_coefs[7], temp[3], temp[7]);
}
-#if HIGH_BIT_DEPTH
-
-#if (X265_DEPTH > 10)
-static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
- const int32x4_t s1, const int32x4_t s2)
+// 4-way Hadamard horizontal pass.
+static inline void hadamard_4_h(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
{
- int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
- int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
+ int16x8_t s0, s1, d0, d1, t0, t1, t2, t3;
+
+ transpose_s16_s16x2(&t0, &t1, in_coefs[0], in_coefs[1]);
+ transpose_s16_s16x2(&t2, &t3, in_coefs[2], in_coefs[3]);
+
+ sumsubq_s16(&s0, &d0, t0, t1);
+ sumsubq_s16(&s1, &d1, t2, t3);
- t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
- t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+ transpose_s16_s32x2(&out_coefs[0], &out_coefs[1], s0, s1);
+ transpose_s16_s32x2(&out_coefs[2], &out_coefs[3], d0, d1);
}
-static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
+#if X265_DEPTH != 12
+// 8-way Hadamard horizontal pass.
+static inline void hadamard_8_h(int16x8_t coefs[8], uint16x8_t out[4])
{
- sum = vaddq_s32(a, b);
- sub = vsubq_s32(a, b);
+ int16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ int16x8_t temp[8];
+
+ hadamard_4_h(coefs, temp);
+ hadamard_4_h(coefs + 4, temp + 4);
+
+ sumsubq_s16(&s0, &d0, temp[0], temp[1]);
+ sumsubq_s16(&s1, &d1, temp[2], temp[3]);
+ sumsubq_s16(&s2, &d2, temp[4], temp[5]);
+ sumsubq_s16(&s3, &d3, temp[6], temp[7]);
+
+ transpose_s16_s64x2(&temp[0], &temp[1], s0, s2);
+ transpose_s16_s64x2(&temp[2], &temp[3], s1, s3);
+ transpose_s16_s64x2(&temp[4], &temp[5], d0, d2);
+ transpose_s16_s64x2(&temp[6], &temp[7], d1, d3);
+
+ out[0] = max_abs_s16(temp[0], temp[1]);
+ out[1] = max_abs_s16(temp[2], temp[3]);
+ out[2] = max_abs_s16(temp[4], temp[5]);
+ out[3] = max_abs_s16(temp[6], temp[7]);
}
-static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
- const int16x8_t a, const int16x8_t b)
+#else // X265_DEPTH == 12
+static inline void hadamard_8_h(int16x8_t coefs[8], uint32x4_t out[4])
{
- suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
- sumh = vaddl_high_s16(a, b);
- subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
- subh = vsubl_high_s16(a, b);
+ int16x8_t a[8];
+
+ transpose_s16_s16x2(&a[0], &a[1], coefs[0], coefs[1]);
+ transpose_s16_s16x2(&a[2], &a[3], coefs[2], coefs[3]);
+ transpose_s16_s16x2(&a[4], &a[5], coefs[4], coefs[5]);
+ transpose_s16_s16x2(&a[6], &a[7], coefs[6], coefs[7]);
+
+ int32x4_t a_lo[8], a_hi[8], b_lo[8], b_hi[8];
+
+ sumsublq_s16(&a_lo[0], &a_hi[0], &a_lo[4], &a_hi[4], a[0], a[1]);
+ sumsublq_s16(&a_lo[1], &a_hi[1], &a_lo[5], &a_hi[5], a[2], a[3]);
+ sumsublq_s16(&a_lo[2], &a_hi[2], &a_lo[6], &a_hi[6], a[4], a[5]);
+ sumsublq_s16(&a_lo[3], &a_hi[3], &a_lo[7], &a_hi[7], a[6], a[7]);
+
+ transpose_inplace_s32_s64x2(&a_lo[0], &a_lo[1]);
+ transpose_inplace_s32_s64x2(&a_lo[2], &a_lo[3]);
+ transpose_inplace_s32_s64x2(&a_lo[4], &a_lo[5]);
+ transpose_inplace_s32_s64x2(&a_lo[6], &a_lo[7]);
+
+ transpose_inplace_s32_s64x2(&a_hi[0], &a_hi[1]);
+ transpose_inplace_s32_s64x2(&a_hi[2], &a_hi[3]);
+ transpose_inplace_s32_s64x2(&a_hi[4], &a_hi[5]);
+ transpose_inplace_s32_s64x2(&a_hi[6], &a_hi[7]);
+
+ sumsubq_s32(&b_lo[0], &b_lo[1], a_lo[0], a_lo[1]);
+ sumsubq_s32(&b_lo[2], &b_lo[3], a_lo[2], a_lo[3]);
+ sumsubq_s32(&b_lo[4], &b_lo[5], a_lo[4], a_lo[5]);
+ sumsubq_s32(&b_lo[6], &b_lo[7], a_lo[6], a_lo[7]);
+
+ sumsubq_s32(&b_hi[0], &b_hi[1], a_hi[0], a_hi[1]);
+ sumsubq_s32(&b_hi[2], &b_hi[3], a_hi[2], a_hi[3]);
+ sumsubq_s32(&b_hi[4], &b_hi[5], a_hi[4], a_hi[5]);
+ sumsubq_s32(&b_hi[6], &b_hi[7], a_hi[6], a_hi[7]);
+
+ uint32x4_t max0_lo = max_abs_s32(b_lo[0], b_hi[0]);
+ uint32x4_t max1_lo = max_abs_s32(b_lo[1], b_hi[1]);
+ uint32x4_t max2_lo = max_abs_s32(b_lo[2], b_hi[2]);
+ uint32x4_t max3_lo = max_abs_s32(b_lo[3], b_hi[3]);
+ uint32x4_t max0_hi = max_abs_s32(b_lo[4], b_hi[4]);
+ uint32x4_t max1_hi = max_abs_s32(b_lo[5], b_hi[5]);
+ uint32x4_t max2_hi = max_abs_s32(b_lo[6], b_hi[6]);
+ uint32x4_t max3_hi = max_abs_s32(b_lo[7], b_hi[7]);
+
+ out[0] = vaddq_u32(max0_lo, max0_hi);
+ out[1] = vaddq_u32(max1_lo, max1_hi);
+ out[2] = vaddq_u32(max2_lo, max2_hi);
+ out[3] = vaddq_u32(max3_lo, max3_hi);
}
-#endif
+#endif // X265_DEPTH != 12
-static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
- int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int hadamard_4x4(int16x8_t a0, int16x8_t a1)
{
- uint16x8_t r0, r1, r2, r3;
- uint16x8_t t0, t1, t2, t3;
- int16x8_t v16, v17;
- int16x8_t v18, v19;
-
- r0 = vld1q_u16(pix1 + 0 * stride_pix1);
- r1 = vld1q_u16(pix1 + 1 * stride_pix1);
- r2 = vld1q_u16(pix1 + 2 * stride_pix1);
- r3 = vld1q_u16(pix1 + 3 * stride_pix1);
-
- t0 = vld1q_u16(pix2 + 0 * stride_pix2);
- t1 = vld1q_u16(pix2 + 1 * stride_pix2);
- t2 = vld1q_u16(pix2 + 2 * stride_pix2);
- t3 = vld1q_u16(pix2 + 3 * stride_pix2);
-
- v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+ int16x8_t sum, dif, t0, t1;
+ sumsubq_s16(&sum, &dif, a0, a1);
- r0 = vld1q_u16(pix1 + 4 * stride_pix1);
- r1 = vld1q_u16(pix1 + 5 * stride_pix1);
- r2 = vld1q_u16(pix1 + 6 * stride_pix1);
- r3 = vld1q_u16(pix1 + 7 * stride_pix1);
+ transpose_s16_s64x2(&t0, &t1, sum, dif);
+ sumsubq_s16(&sum, &dif, t0, t1);
- t0 = vld1q_u16(pix2 + 4 * stride_pix2);
- t1 = vld1q_u16(pix2 + 5 * stride_pix2);
- t2 = vld1q_u16(pix2 + 6 * stride_pix2);
- t3 = vld1q_u16(pix2 + 7 * stride_pix2);
+ transpose_s16_s16x2(&t0, &t1, sum, dif);
+ sumsubq_s16(&sum, &dif, t0, t1);
- v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+ transpose_s16_s32x2(&t0, &t1, sum, dif);
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ uint16x8_t max = max_abs_s16(t0, t1);
+ return vaddlvq_u16(max);
}
-
-
-
-static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+// Calculate two 4x4 Hadamard transforms.
+static void hadamard_4x4_dual(int16x8_t diff[4], uint16x8_t *out)
{
- uint16x8_t r0, r1, r2, r3;
- uint16x8_t t0, t1, t2, t3;
- int16x8_t v16, v17, v20, v21;
- int16x8_t v18, v19, v22, v23;
+ int16x8_t temp[4];
- r0 = vld1q_u16(pix1 + 0 * stride_pix1);
- r1 = vld1q_u16(pix1 + 1 * stride_pix1);
- r2 = vld1q_u16(pix1 + 2 * stride_pix1);
- r3 = vld1q_u16(pix1 + 3 * stride_pix1);
+ hadamard_4_v(diff, temp);
+ hadamard_4_h(temp, diff);
- t0 = vld1q_u16(pix2 + 0 * stride_pix2);
- t1 = vld1q_u16(pix2 + 1 * stride_pix2);
- t2 = vld1q_u16(pix2 + 2 * stride_pix2);
- t3 = vld1q_u16(pix2 + 3 * stride_pix2);
+ uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+ uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
- v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
-
- r0 = vld1q_u16(pix1 + 0 * stride_pix1 + 8);
- r1 = vld1q_u16(pix1 + 1 * stride_pix1 + 8);
- r2 = vld1q_u16(pix1 + 2 * stride_pix1 + 8);
- r3 = vld1q_u16(pix1 + 3 * stride_pix1 + 8);
+ *out = vaddq_u16(sum0, sum1);
+}
- t0 = vld1q_u16(pix2 + 0 * stride_pix2 + 8);
- t1 = vld1q_u16(pix2 + 1 * stride_pix2 + 8);
- t2 = vld1q_u16(pix2 + 2 * stride_pix2 + 8);
- t3 = vld1q_u16(pix2 + 3 * stride_pix2 + 8);
+// Calculate four 4x4 Hadamard transforms.
+static inline void hadamard_4x4_quad(int16x8_t diff[8], uint16x8_t out[2])
+{
+ int16x8_t temp[8];
- v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+ hadamard_4_v(diff, temp);
+ hadamard_4_v(diff + 4, temp + 4);
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ hadamard_4_h(temp, diff);
+ hadamard_4_h(temp + 4, diff + 4);
- _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+ uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+ uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
+ uint16x8_t sum2 = max_abs_s16(diff[4], diff[5]);
+ uint16x8_t sum3 = max_abs_s16(diff[6], diff[7]);
+ out[0] = vaddq_u16(sum0, sum1);
+ out[1] = vaddq_u16(sum2, sum3);
}
-
-int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if X265_DEPTH == 8
+static inline void hadamard_8x8(int16x8_t diff[8], uint16x8_t out[2])
{
- uint16x4_t t0_0 = vld1_u16(pix1 + 0 * stride_pix1);
- uint16x4_t t1_0 = vld1_u16(pix1 + 1 * stride_pix1);
- uint16x4_t t0_1 = vld1_u16(pix1 + 2 * stride_pix1);
- uint16x4_t t1_1 = vld1_u16(pix1 + 3 * stride_pix1);
- uint16x8_t t0 = vcombine_u16(t0_0, t0_1);
- uint16x8_t t1 = vcombine_u16(t1_0, t1_1);
+ int16x8_t temp[8];
+ uint16x8_t sum[4];
- uint16x4_t r0_0 = vld1_u16(pix2 + 0 * stride_pix2);
- uint16x4_t r1_0 = vld1_u16(pix2 + 1 * stride_pix2);
- uint16x4_t r0_1 = vld1_u16(pix2 + 2 * stride_pix2);
- uint16x4_t r1_1 = vld1_u16(pix2 + 3 * stride_pix2);
- uint16x8_t r0 = vcombine_u16(r0_0, r0_1);
- uint16x8_t r1 = vcombine_u16(r1_0, r1_1);
+ hadamard_8_v(diff, temp);
+ hadamard_8_h(temp, sum);
- int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(t0, r0));
- int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-
- return _satd_4x4_neon(v0, v1);
+ out[0] = vaddq_u16(sum[0], sum[1]);
+ out[1] = vaddq_u16(sum[2], sum[3]);
}
+#elif X265_DEPTH == 10
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
+{
+ int16x8_t temp[8];
+ uint16x8_t sum[4];
+ hadamard_8_v(diff, temp);
+ hadamard_8_h(temp, sum);
+ out[0] = vpaddlq_u16(sum[0]);
+ out[1] = vpaddlq_u16(sum[1]);
+ out[0] = vpadalq_u16(out[0], sum[2]);
+ out[1] = vpadalq_u16(out[1], sum[3]);
+}
-
-
-int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#elif X265_DEPTH == 12
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
{
- uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
-
- i0 = vld1q_u16(pix1 + 0 * stride_pix1);
- i1 = vld1q_u16(pix2 + 0 * stride_pix2);
- i2 = vld1q_u16(pix1 + 1 * stride_pix1);
- i3 = vld1q_u16(pix2 + 1 * stride_pix2);
- i4 = vld1q_u16(pix1 + 2 * stride_pix1);
- i5 = vld1q_u16(pix2 + 2 * stride_pix2);
- i6 = vld1q_u16(pix1 + 3 * stride_pix1);
- i7 = vld1q_u16(pix2 + 3 * stride_pix2);
+ int16x8_t temp[8];
+ uint32x4_t sum[4];
- int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(i0, i1));
- int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(i2, i3));
- int16x8_t v2 = vreinterpretq_s16_u16(vsubq_u16(i4, i5));
- int16x8_t v3 = vreinterpretq_s16_u16(vsubq_u16(i6, i7));
+ hadamard_8_v(diff, temp);
+ hadamard_8_h(temp, sum);
- return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+ out[0] = vaddq_u32(sum[0], sum[1]);
+ out[1] = vaddq_u32(sum[2], sum[3]);
}
+#endif // X265_DEPTH == 8
-int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
- int16x8_t v0, v1, v2, v3;
+ uint16x4_t s[4], r[4];
+ load_u16x4xn<4>(pix1, stride_pix1, s);
+ load_u16x4xn<4>(pix2, stride_pix2, r);
- for (int offset = 0; offset <= 12; offset += 4)
- {
- _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1,
- pix2 + offset * stride_pix2,stride_pix2,
- v0, v1, v2, v3);
- v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v0));
- v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v1));
- v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v2));
- v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v3));
- }
+ uint16x8_t s0 = vcombine_u16(s[0], s[2]);
+ uint16x8_t s1 = vcombine_u16(s[1], s[3]);
+ uint16x8_t r0 = vcombine_u16(r[0], r[2]);
+ uint16x8_t r1 = vcombine_u16(r[1], r[3]);
+
+ int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
- return vaddvq_u32(vaddq_u32(v30, v31));
+ return hadamard_4x4(diff0, diff1);
}
-#else //HIGH_BIT_DEPTH
+static inline int pixel_satd_4x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[4];
+
+ uint16x4_t s[8], r[8];
+ load_u16x4xn<8>(pix1, stride_pix1, s);
+ load_u16x4xn<8>(pix2, stride_pix2, r);
+
+ uint16x8_t s0 = vcombine_u16(s[0], s[4]);
+ uint16x8_t s1 = vcombine_u16(s[1], s[5]);
+ uint16x8_t s2 = vcombine_u16(s[2], s[6]);
+ uint16x8_t s3 = vcombine_u16(s[3], s[7]);
+ uint16x8_t r0 = vcombine_u16(r[0], r[4]);
+ uint16x8_t r1 = vcombine_u16(r[1], r[5]);
+ uint16x8_t r2 = vcombine_u16(r[2], r[6]);
+ uint16x8_t r3 = vcombine_u16(r[3], r[7]);
+
+ diff[0] = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ diff[1] = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
+ diff[2] = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+ diff[3] = vreinterpretq_s16_u16(vsubq_u16(r3, s3));
+
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
+
+ return vaddlvq_u16(out);
+}
-static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint8x16_t r0, r1, r2, r3;
- uint8x16_t t0, t1, t2, t3;
- int16x8_t v16, v17, v20, v21;
- int16x8_t v18, v19, v22, v23;
+ int16x8_t diff[4];
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
- r0 = vld1q_u8(pix1 + 0 * stride_pix1);
- r1 = vld1q_u8(pix1 + 1 * stride_pix1);
- r2 = vld1q_u8(pix1 + 2 * stride_pix1);
- r3 = vld1q_u8(pix1 + 3 * stride_pix1);
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
- t0 = vld1q_u8(pix2 + 0 * stride_pix2);
- t1 = vld1q_u8(pix2 + 1 * stride_pix2);
- t2 = vld1q_u8(pix2 + 2 * stride_pix2);
- t3 = vld1q_u8(pix2 + 3 * stride_pix2);
+ return vaddlvq_u16(out);
+}
- v16 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r0), vget_low_u8(t0)));
- v20 = vreinterpretq_s16_u16(vsubl_high_u8(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r1), vget_low_u8(t1)));
- v21 = vreinterpretq_s16_u16(vsubl_high_u8(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r2), vget_low_u8(t2)));
- v22 = vreinterpretq_s16_u16(vsubl_high_u8(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r3), vget_low_u8(t3)));
- v23 = vreinterpretq_s16_u16(vsubl_high_u8(r3, t3));
+static inline int pixel_satd_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
+ uint16x8_t out[2];
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
- _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+ uint32x4_t res = vpaddlq_u16(out[0]);
+ res = vpadalq_u16(res, out[1]);
+ return vaddvq_u32(res);
}
-
-static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
- int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int pixel_satd_8x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint8x8_t r0, r1, r2, r3;
- uint8x8_t t0, t1, t2, t3;
- int16x8_t v16, v17;
- int16x8_t v18, v19;
+ int16x8_t diff[16];
+ uint16x8_t out[4];
+
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4_dual(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
- r0 = vld1_u8(pix1 + 0 * stride_pix1);
- r1 = vld1_u8(pix1 + 1 * stride_pix1);
- r2 = vld1_u8(pix1 + 2 * stride_pix1);
- r3 = vld1_u8(pix1 + 3 * stride_pix1);
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
- t0 = vld1_u8(pix2 + 0 * stride_pix2);
- t1 = vld1_u8(pix2 + 1 * stride_pix2);
- t2 = vld1_u8(pix2 + 2 * stride_pix2);
- t3 = vld1_u8(pix2 + 3 * stride_pix2);
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
- v16 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+ uint32x4_t res = vpaddlq_u16(sum0);
+ res = vpadalq_u16(res, sum1);
- r0 = vld1_u8(pix1 + 4 * stride_pix1);
- r1 = vld1_u8(pix1 + 5 * stride_pix1);
- r2 = vld1_u8(pix1 + 6 * stride_pix1);
- r3 = vld1_u8(pix1 + 7 * stride_pix1);
+ return vaddvq_u32(res);
+}
- t0 = vld1_u8(pix2 + 4 * stride_pix2);
- t1 = vld1_u8(pix2 + 5 * stride_pix2);
- t2 = vld1_u8(pix2 + 6 * stride_pix2);
- t3 = vld1_u8(pix2 + 7 * stride_pix2);
+static inline int pixel_satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
- v20 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
- v21 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
- v22 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
- v23 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
+ uint16x8_t sum0, sum1;
+ hadamard_4x4_dual(diff, &sum0);
+ hadamard_4x4_dual(diff + 4, &sum1);
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ sum0 = vaddq_u16(sum0, sum1);
+ return vaddlvq_u16(sum0);
}
-int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint8x8_t t0 = load_u8x4x2(pix1, 2 * stride_pix1);
- uint8x8_t t1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
+ int16x8_t diff[16];
+ uint16x8_t out[4];
- uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
- uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4_dual(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
- return _satd_4x4_neon(vreinterpretq_s16_u16(vsubl_u8(t0, r0)),
- vreinterpretq_s16_u16(vsubl_u8(r1, t1)));
-}
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
+#if X265_DEPTH == 10
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
-int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
-{
- uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
+ sum0 = vaddq_u16(sum0, sum1);
- i0 = vld1_u8(pix1 + 0 * stride_pix1);
- i1 = vld1_u8(pix2 + 0 * stride_pix2);
- i2 = vld1_u8(pix1 + 1 * stride_pix1);
- i3 = vld1_u8(pix2 + 1 * stride_pix2);
- i4 = vld1_u8(pix1 + 2 * stride_pix1);
- i5 = vld1_u8(pix2 + 2 * stride_pix2);
- i6 = vld1_u8(pix1 + 3 * stride_pix1);
- i7 = vld1_u8(pix2 + 3 * stride_pix2);
+ return vaddlvq_u16(sum0);
+#else // X265_DEPTH == 12
+ uint32x4_t sum0 = vpaddlq_u16(out[0]);
+ uint32x4_t sum1 = vpaddlq_u16(out[1]);
+ sum0 = vpadalq_u16(sum0, out[2]);
+ sum1 = vpadalq_u16(sum1, out[3]);
- int16x8_t v0 = vreinterpretq_s16_u16(vsubl_u8(i0, i1));
- int16x8_t v1 = vreinterpretq_s16_u16(vsubl_u8(i2, i3));
- int16x8_t v2 = vreinterpretq_s16_u16(vsubl_u8(i4, i5));
- int16x8_t v3 = vreinterpretq_s16_u16(vsubl_u8(i6, i7));
+ sum0 = vaddq_u32(sum0, sum1);
- return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+ return vaddvq_u32(sum0);
+#endif // X265_DEPTH == 10
}
-int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint16x8_t v30, v31;
- int16x8_t v0, v1, v2, v3;
- uint16x8_t t0, t1;
+    uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ int16x8_t diff[8];
+ uint16x8_t out[2];
- _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
- v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ for (int i = 0; i < 4; ++i)
+ {
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
- _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
- v30 = vaddq_u16(v30, t0);
- v31 = vaddq_u16(v31, t1);
+ hadamard_4x4_quad(diff, out);
- _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
- v30 = vaddq_u16(v30, t0);
- v31 = vaddq_u16(v31, t1);
+ sum[0] = vpadalq_u16(sum[0], out[0]);
+ sum[1] = vpadalq_u16(sum[1], out[1]);
- _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
- v30 = vaddq_u16(v30, t0);
- v31 = vaddq_u16(v31, t1);
+ pix1 += 4 * stride_pix1;
+ pix2 += 4 * stride_pix2;
+ }
- uint32x4_t sum0 = vpaddlq_u16(v30);
- uint32x4_t sum1 = vpaddlq_u16(v31);
- sum0 = vaddq_u32(sum0, sum1);
- return vaddvq_u32(sum0);
+ return vaddvq_u32(vaddq_u32(sum[0], sum[1]));
}
-#endif //HIGH_BIT_DEPTH
-#if HIGH_BIT_DEPTH
-typedef uint32x4_t sa8d_out_type;
-#else
-typedef uint16x8_t sa8d_out_type;
-#endif
-
-static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
- int16x8_t v3, int16x8_t v20,
- int16x8_t v21, int16x8_t v22,
- int16x8_t v23, sa8d_out_type &out0,
- sa8d_out_type &out1)
+static inline int pixel_sa8d_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v16, v17, v18, v19;
- int16x8_t v4, v5, v6, v7;
+ int16x8_t diff[8];
+ uint32x4_t res[2];
- SUMSUB_AB(v16, v18, v0, v2);
- SUMSUB_AB(v17, v19, v1, v3);
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
- HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
+ uint32x4_t s = vaddq_u32(res[0], res[1]);
- SUMSUB_AB(v0, v16, v16, v20);
- SUMSUB_AB(v1, v17, v17, v21);
- SUMSUB_AB(v2, v18, v18, v22);
- SUMSUB_AB(v3, v19, v19, v23);
-
- transpose_8h_8h(v20, v21, v16, v17);
- transpose_8h_8h(v4, v5, v0, v1);
- transpose_8h_8h(v22, v23, v18, v19);
- transpose_8h_8h(v6, v7, v2, v3);
+ return (vaddvq_u32(s) + 1) >> 1;
+}
-#if (X265_DEPTH <= 10)
+static inline int pixel_sa8d_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ uint32x4_t sum0, sum1;
- int16x8_t v24, v25;
+ int16x8_t diff[8];
+ uint32x4_t res[2];
- SUMSUB_AB(v2, v3, v20, v21);
- SUMSUB_AB(v24, v25, v4, v5);
- SUMSUB_AB(v0, v1, v22, v23);
- SUMSUB_AB(v4, v5, v6, v7);
+ load_diff_u16x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vaddq_u32(res[0], res[1]);
- transpose_4s_8h(v20, v22, v2, v0);
- transpose_4s_8h(v21, v23, v3, v1);
- transpose_4s_8h(v16, v18, v24, v4);
- transpose_4s_8h(v17, v19, v25, v5);
+ load_diff_u16x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum1 = vaddq_u32(res[0], res[1]);
- SUMSUB_AB(v0, v2, v20, v22);
- SUMSUB_AB(v1, v3, v21, v23);
- SUMSUB_AB(v4, v6, v16, v18);
- SUMSUB_AB(v5, v7, v17, v19);
+ load_diff_u16x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vaddq_u32(sum0, res[0]);
+ sum1 = vaddq_u32(sum1, res[1]);
- transpose_2d_8h(v16, v20, v0, v4);
- transpose_2d_8h(v17, v21, v1, v5);
- transpose_2d_8h(v18, v22, v2, v6);
- transpose_2d_8h(v19, v23, v3, v7);
+ load_diff_u16x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+ pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vaddq_u32(sum0, res[0]);
+ sum1 = vaddq_u32(sum1, res[1]);
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v16));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v17));
- uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v18));
- uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v19));
- uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v20));
- uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v21));
- uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v22));
- uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v23));
+ sum0 = vaddq_u32(sum0, sum1);
- uint16x8_t max0 = vmaxq_u16(abs0, abs4);
- uint16x8_t max1 = vmaxq_u16(abs1, abs5);
- uint16x8_t max2 = vmaxq_u16(abs2, abs6);
- uint16x8_t max3 = vmaxq_u16(abs3, abs7);
+ return (vaddvq_u32(sum0) + 1) >> 1;
+}
-#if HIGH_BIT_DEPTH
- out0 = vpaddlq_u16(max0);
- out1 = vpaddlq_u16(max1);
- out0 = vpadalq_u16(out0, max2);
- out1 = vpadalq_u16(out1, max3);
-
-#else //HIGH_BIT_DEPTH
-
- out0 = vaddq_u16(max0, max1);
- out1 = vaddq_u16(max2, max3);
-
-#endif //HIGH_BIT_DEPTH
-
-#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32, each int16x8 is up-convreted to 2 int32x4 (low and high)
-
- int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
- int32x4_t v22l, v22h, v23l, v23h;
- int32x4_t v4l, v4h, v5l, v5h;
- int32x4_t v6l, v6h, v7l, v7h;
- int32x4_t v16l, v16h, v17l, v17h;
- int32x4_t v18l, v18h, v19l, v19h;
- int32x4_t v20l, v20h, v21l, v21h;
-
- ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
- ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
-
- v22l = vmovl_s16(vget_low_s16(v22));
- v22h = vmovl_high_s16(v22);
- v23l = vmovl_s16(vget_low_s16(v23));
- v23h = vmovl_high_s16(v23);
-
- ISUMSUB_AB(v0l, v1l, v22l, v23l);
- ISUMSUB_AB(v0h, v1h, v22h, v23h);
-
- v6l = vmovl_s16(vget_low_s16(v6));
- v6h = vmovl_high_s16(v6);
- v7l = vmovl_s16(vget_low_s16(v7));
- v7h = vmovl_high_s16(v7);
-
- ISUMSUB_AB(v4l, v5l, v6l, v7l);
- ISUMSUB_AB(v4h, v5h, v6h, v7h);
-
- transpose_2d_4s(v20l, v22l, v2l, v0l);
- transpose_2d_4s(v21l, v23l, v3l, v1l);
- transpose_2d_4s(v16l, v18l, v24l, v4l);
- transpose_2d_4s(v17l, v19l, v25l, v5l);
-
- transpose_2d_4s(v20h, v22h, v2h, v0h);
- transpose_2d_4s(v21h, v23h, v3h, v1h);
- transpose_2d_4s(v16h, v18h, v24h, v4h);
- transpose_2d_4s(v17h, v19h, v25h, v5h);
-
- ISUMSUB_AB(v0l, v2l, v20l, v22l);
- ISUMSUB_AB(v1l, v3l, v21l, v23l);
- ISUMSUB_AB(v4l, v6l, v16l, v18l);
- ISUMSUB_AB(v5l, v7l, v17l, v19l);
-
- ISUMSUB_AB(v0h, v2h, v20h, v22h);
- ISUMSUB_AB(v1h, v3h, v21h, v23h);
- ISUMSUB_AB(v4h, v6h, v16h, v18h);
- ISUMSUB_AB(v5h, v7h, v17h, v19h);
-
- v16l = v0l;
- v16h = v4l;
- v20l = v0h;
- v20h = v4h;
-
- v17l = v1l;
- v17h = v5l;
- v21l = v1h;
- v21h = v5h;
-
- v18l = v2l;
- v18h = v6l;
- v22l = v2h;
- v22h = v6h;
-
- v19l = v3l;
- v19h = v7l;
- v23l = v3h;
- v23h = v7h;
-
- uint32x4_t abs0_lo = vreinterpretq_u32_s32(vabsq_s32(v16l));
- uint32x4_t abs1_lo = vreinterpretq_u32_s32(vabsq_s32(v17l));
- uint32x4_t abs2_lo = vreinterpretq_u32_s32(vabsq_s32(v18l));
- uint32x4_t abs3_lo = vreinterpretq_u32_s32(vabsq_s32(v19l));
- uint32x4_t abs4_lo = vreinterpretq_u32_s32(vabsq_s32(v20l));
- uint32x4_t abs5_lo = vreinterpretq_u32_s32(vabsq_s32(v21l));
- uint32x4_t abs6_lo = vreinterpretq_u32_s32(vabsq_s32(v22l));
- uint32x4_t abs7_lo = vreinterpretq_u32_s32(vabsq_s32(v23l));
-
- uint32x4_t abs0_hi = vreinterpretq_u32_s32(vabsq_s32(v16h));
- uint32x4_t abs1_hi = vreinterpretq_u32_s32(vabsq_s32(v17h));
- uint32x4_t abs2_hi = vreinterpretq_u32_s32(vabsq_s32(v18h));
- uint32x4_t abs3_hi = vreinterpretq_u32_s32(vabsq_s32(v19h));
- uint32x4_t abs4_hi = vreinterpretq_u32_s32(vabsq_s32(v20h));
- uint32x4_t abs5_hi = vreinterpretq_u32_s32(vabsq_s32(v21h));
- uint32x4_t abs6_hi = vreinterpretq_u32_s32(vabsq_s32(v22h));
- uint32x4_t abs7_hi = vreinterpretq_u32_s32(vabsq_s32(v23h));
-
- uint32x4_t max0_lo = vmaxq_u32(abs0_lo, abs4_lo);
- uint32x4_t max1_lo = vmaxq_u32(abs1_lo, abs5_lo);
- uint32x4_t max2_lo = vmaxq_u32(abs2_lo, abs6_lo);
- uint32x4_t max3_lo = vmaxq_u32(abs3_lo, abs7_lo);
-
- uint32x4_t max0_hi = vmaxq_u32(abs0_hi, abs4_hi);
- uint32x4_t max1_hi = vmaxq_u32(abs1_hi, abs5_hi);
- uint32x4_t max2_hi = vmaxq_u32(abs2_hi, abs6_hi);
- uint32x4_t max3_hi = vmaxq_u32(abs3_hi, abs7_hi);
-
- uint32x4_t sum0 = vaddq_u32(max0_lo, max0_hi);
- uint32x4_t sum1 = vaddq_u32(max1_lo, max1_hi);
- uint32x4_t sum2 = vaddq_u32(max2_lo, max2_hi);
- uint32x4_t sum3 = vaddq_u32(max3_lo, max3_hi);
-
- out0 = vaddq_u32(sum0, sum1);
- out1 = vaddq_u32(sum2, sum3);
+#else // !HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ uint8x8_t s0 = load_u8x4x2(pix1, 2 * stride_pix1);
+ uint8x8_t s1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
+ uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
+ uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
-#endif
+ int16x8_t diff0 = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+ int16x8_t diff1 = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
+ return hadamard_4x4(diff0, diff1);
}
-
-
-static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_4x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
+ int16x8_t diff[4];
+
+ uint8x8_t s0 = load_u8x4x2(pix1 + 0 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t s1 = load_u8x4x2(pix1 + 1 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t s2 = load_u8x4x2(pix1 + 2 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t s3 = load_u8x4x2(pix1 + 3 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t r0 = load_u8x4x2(pix2 + 0 * stride_pix2, 4 * stride_pix2);
+ uint8x8_t r1 = load_u8x4x2(pix2 + 1 * stride_pix2, 4 * stride_pix2);
+ uint8x8_t r2 = load_u8x4x2(pix2 + 2 * stride_pix2, 4 * stride_pix2);
+ uint8x8_t r3 = load_u8x4x2(pix2 + 3 * stride_pix2, 4 * stride_pix2);
+
+ diff[0] = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+ diff[1] = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
+ diff[2] = vreinterpretq_s16_u16(vsubl_u8(s2, r2));
+ diff[3] = vreinterpretq_s16_u16(vsubl_u8(r3, s3));
+
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
+
+ return vaddlvq_u16(out);
+}
- int16x8_t v20, v21, v22, v23;
- _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+static inline int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[4];
-}
+ load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
+ return vaddlvq_u16(out);
+}
-int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v0, v1, v2, v3;
+ int16x8_t diff[8];
+ uint16x8_t out[2];
- _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
- uint16x8_t v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- uint16x8_t v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
-#if !(HIGH_BIT_DEPTH)
- uint16x8_t sum = vaddq_u16(v30, v31);
- return vaddvq_u32(vpaddlq_u16(sum));
-#else
- uint32x4_t sum = vpaddlq_u16(v30);
- sum = vpadalq_u16(sum, v31);
- return vaddvq_u32(sum);
-#endif
-}
+ out[0] = vaddq_u16(out[0], out[1]);
+ return vaddlvq_u16(out[0]);
+}
-int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v0, v1, v2, v3;
- int16x8_t v20, v21, v22, v23;
- sa8d_out_type res0, res1;
+ int16x8_t diff[16];
+ uint16x8_t out[4];
- _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
-
-#if HIGH_BIT_DEPTH
- uint32x4_t s = vaddq_u32(res0, res1);
- return (vaddvq_u32(s) + 1) >> 1;
-#else
- return (vaddlvq_u16(vaddq_u16(res0, res1)) + 1) >> 1;
-#endif
-}
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
+ sum0 = vaddq_u16(sum0, sum1);
+ return vaddlvq_u16(sum0);
+}
-int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v0, v1, v2, v3;
- int16x8_t v20, v21, v22, v23;
- sa8d_out_type res0, res1;
- uint32x4_t v30, v31;
+ int16x8_t diff[8];
- _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpaddlq_u16(res0);
- v31 = vpaddlq_u16(res1);
-#else
- v30 = vaddq_u32(res0, res1);
-#endif
+ uint16x8_t out[2];
+ hadamard_4x4_dual(diff, &out[0]);
+ hadamard_4x4_dual(diff + 4, &out[1]);
- _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ out[0] = vaddq_u16(out[0], out[1]);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpadalq_u16(v30, res0);
- v31 = vpadalq_u16(v31, res1);
-#else
- v31 = vaddq_u32(res0, res1);
-#endif
+ return vaddlvq_u16(out[0]);
+}
+static inline int pixel_satd_16x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[16];
+ uint16x8_t out[4];
- _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
- v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpadalq_u16(v30, res0);
- v31 = vpadalq_u16(v31, res1);
-#else
- v30 = vaddq_u32(v30, res0);
- v31 = vaddq_u32(v31, res1);
-#endif
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
- _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
- v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpadalq_u16(v30, res0);
- v31 = vpadalq_u16(v31, res1);
-#else
- v30 = vaddq_u32(v30, res0);
- v31 = vaddq_u32(v31, res1);
-#endif
+ sum0 = vaddq_u16(sum0, sum1);
+
+ return vaddlvq_u16(sum0);
+}
+
+static inline int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ uint16x8_t sum[2], out[2];
+ int16x8_t diff[8];
+
+ load_diff_u8x16x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = out[0];
+ sum[1] = out[1];
+
+ load_diff_u8x16x4(pix1 + 4 * stride_pix1, stride_pix1,
+ pix2 + 4 * stride_pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = vaddq_u16(sum[0], out[0]);
+ sum[1] = vaddq_u16(sum[1], out[1]);
+
+ load_diff_u8x16x4(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = vaddq_u16(sum[0], out[0]);
+ sum[1] = vaddq_u16(sum[1], out[1]);
+
+ load_diff_u8x16x4(pix1 + 12 * stride_pix1, stride_pix1,
+ pix2 + 12 * stride_pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = vaddq_u16(sum[0], out[0]);
+ sum[1] = vaddq_u16(sum[1], out[1]);
+
+ uint32x4_t sum0 = vpaddlq_u16(sum[0]);
+ uint32x4_t sum1 = vpaddlq_u16(sum[1]);
- v30 = vaddq_u32(v30, v31);
+ sum0 = vaddq_u32(sum0, sum1);
- return (vaddvq_u32(v30) + 1) >> 1;
+ return vaddvq_u32(sum0);
}
+static inline int pixel_sa8d_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
+ uint16x8_t res[2];
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ return (vaddlvq_u16(vaddq_u16(res[0], res[1])) + 1) >> 1;
+}
+static inline int pixel_sa8d_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
+ uint16x8_t res[2];
+ uint32x4_t sum0, sum1;
+
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpaddlq_u16(res[0]);
+ sum1 = vpaddlq_u16(res[1]);
+
+ load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpadalq_u16(sum0, res[0]);
+ sum1 = vpadalq_u16(sum1, res[1]);
+
+ load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpadalq_u16(sum0, res[0]);
+ sum1 = vpadalq_u16(sum1, res[1]);
+
+ load_diff_u8x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+ pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpadalq_u16(sum0, res[0]);
+ sum1 = vpadalq_u16(sum1, res[1]);
+ sum0 = vaddq_u32(sum0, sum1);
+ return (vaddvq_u32(sum0) + 1) >> 1;
+}
+#endif // HIGH_BIT_DEPTH
template<int size>
void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
@@ -1425,7 +1431,7 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
template<int w, int h>
// Calculate sa8d in blocks of 8x8
-int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d8_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
{
int cost = 0;
@@ -1440,7 +1446,7 @@ int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2
template<int w, int h>
// Calculate sa8d in blocks of 16x16
-int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d16_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
{
int cost = 0;
@@ -1474,42 +1480,63 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
template<int w, int h>
-// calculate satd in blocks of 4x4
int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
{
int satd = 0;
- for (int row = 0; row < h; row += 4)
- for (int col = 0; col < w; col += 4)
- satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
- pix2 + row * stride_pix2 + col, stride_pix2);
+ if (w == 4 && h == 4) {
+ satd = pixel_satd_4x4_neon(pix1, stride_pix1, pix2, stride_pix2);
+ } else {
+ for (int row = 0; row < h; row += 8)
+ for (int col = 0; col < w; col += 4)
+ satd += pixel_satd_4x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
return satd;
}
template<int w, int h>
-// calculate satd in blocks of 8x4
int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
{
int satd = 0;
- if (((w | h) & 15) == 0)
+ if (w % 16 == 0 && h % 16 == 0)
{
for (int row = 0; row < h; row += 16)
for (int col = 0; col < w; col += 16)
satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
-
}
- else if (((w | h) & 7) == 0)
+ else if (w % 8 == 0 && h % 16 == 0)
+ {
+ for (int row = 0; row < h; row += 16)
+ for (int col = 0; col < w; col += 8)
+ satd += pixel_satd_8x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
+ else if (w % 16 == 0 && h % 8 == 0)
+ {
+ for (int row = 0; row < h; row += 8)
+ for (int col = 0; col < w; col += 16)
+ satd += pixel_satd_16x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
+ else if (w % 16 == 0 && h % 4 == 0)
+ {
+ for (int row = 0; row < h; row += 4)
+ for (int col = 0; col < w; col += 16)
+ satd += pixel_satd_16x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
+ else if (w % 8 == 0 && h % 8 == 0)
{
for (int row = 0; row < h; row += 8)
for (int col = 0; col < w; col += 8)
satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
-
}
- else
+ else // w multiple of 8, h multiple of 4
{
for (int row = 0; row < h; row += 4)
for (int col = 0; col < w; col += 8)
@@ -1634,38 +1661,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
LUMA_PU(64, 16);
LUMA_PU(16, 64);
- p.pu[LUMA_4x4].satd = pixel_satd_4x4_neon;
- p.pu[LUMA_8x4].satd = pixel_satd_8x4_neon;
-
+ p.pu[LUMA_4x4].satd = satd4_neon<4, 4>;
+ p.pu[LUMA_4x8].satd = satd4_neon<4, 8>;
+ p.pu[LUMA_4x16].satd = satd4_neon<4, 16>;
+ p.pu[LUMA_8x4].satd = satd8_neon<8, 4>;
p.pu[LUMA_8x8].satd = satd8_neon<8, 8>;
- p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
- p.pu[LUMA_16x8].satd = satd8_neon<16, 8>;
p.pu[LUMA_8x16].satd = satd8_neon<8, 16>;
- p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+ p.pu[LUMA_8x32].satd = satd8_neon<8, 32>;
+ p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
p.pu[LUMA_16x4].satd = satd8_neon<16, 4>;
- p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
- p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+ p.pu[LUMA_16x8].satd = satd8_neon<16, 8>;
+ p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+ p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
p.pu[LUMA_16x32].satd = satd8_neon<16, 32>;
- p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+ p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
p.pu[LUMA_24x32].satd = satd8_neon<24, 32>;
p.pu[LUMA_32x8].satd = satd8_neon<32, 8>;
- p.pu[LUMA_8x32].satd = satd8_neon<8, 32>;
- p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
- p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+ p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+ p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+ p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
p.pu[LUMA_32x64].satd = satd8_neon<32, 64>;
- p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
p.pu[LUMA_48x64].satd = satd8_neon<48, 64>;
p.pu[LUMA_64x16].satd = satd8_neon<64, 16>;
- p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
-
-#if HIGH_BIT_DEPTH
- p.pu[LUMA_4x8].satd = satd4_neon<4, 8>;
- p.pu[LUMA_4x16].satd = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
- p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+ p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+ p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
+ p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
LUMA_CU(4, 4);
@@ -1673,7 +1693,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
LUMA_CU(16, 16);
LUMA_CU(32, 32);
LUMA_CU(64, 64);
-
+
#if !(HIGH_BIT_DEPTH)
p.cu[BLOCK_8x8].var = pixel_var_neon<8>;
p.cu[BLOCK_16x16].var = pixel_var_neon<16>;
@@ -1697,17 +1717,17 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_8x8].calcresidual[ALIGNED] = getResidual_neon<8>;
p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = getResidual_neon<16>;
p.cu[BLOCK_16x16].calcresidual[ALIGNED] = getResidual_neon<16>;
-
+
#if defined(__APPLE__)
p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = getResidual_neon<32>;
p.cu[BLOCK_32x32].calcresidual[ALIGNED] = getResidual_neon<32>;
#endif // defined(__APPLE__)
- p.cu[BLOCK_4x4].sa8d = pixel_satd_4x4_neon;
- p.cu[BLOCK_8x8].sa8d = pixel_sa8d_8x8_neon;
- p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
- p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
- p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
+ p.cu[BLOCK_4x4].sa8d = satd4_neon<4, 4>;
+ p.cu[BLOCK_8x8].sa8d = sa8d8_neon<8, 8>;
+ p.cu[BLOCK_16x16].sa8d = sa8d16_neon<16, 16>;
+ p.cu[BLOCK_32x32].sa8d = sa8d16_neon<32, 32>;
+ p.cu[BLOCK_64x64].sa8d = sa8d16_neon<64, 64>;
#define CHROMA_PU_420(W, H) \
@@ -1743,38 +1763,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = pixel_satd_4x4_neon;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8_neon<8, 8>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = pixel_satd_8x4_neon;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8_neon<16, 8>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8_neon<8, 16>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4_neon<16, 12>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd4_neon<16, 4>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8_neon<32, 8>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8_neon<8, 32>;
-
-#if HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = satd4_neon<4, 4>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = satd4_neon<4, 8>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = satd8_neon<8, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8_neon<8, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8_neon<8, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8_neon<8, 32>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd8_neon<16, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8_neon<16, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd8_neon<16, 12>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8_neon<32, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
#define CHROMA_CU_420(W, H) \
@@ -1783,7 +1795,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-
+
#define CHROMA_CU_S_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
@@ -1799,9 +1811,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
- p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
- p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
- p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8_neon<8, 8>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16_neon<16, 16>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16_neon<32, 32>;
#define CHROMA_PU_422(W, H) \
@@ -1837,34 +1849,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8_neon<8, 16>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = pixel_satd_4x4_neon;
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = satd4_neon<4, 4>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4_neon<4, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4_neon<4, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4_neon<4, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd8_neon<8, 4>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = satd8_neon<8, 8>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd8_neon<8, 12>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8_neon<8, 16>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = satd8_neon<8, 32>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd4_neon<8, 4>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8_neon<16, 8>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd4_neon<8, 12>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = satd8_neon<8, 64>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8_neon<16, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8_neon<16, 24>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8_neon<24, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8_neon<32, 48>;
-
-#if HIGH_BIT_DEPTH
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4_neon<4, 8>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4_neon<4, 16>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4_neon<4, 32>;
-#endif // HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
#define CHROMA_CU_422(W, H) \
@@ -1887,10 +1896,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
CHROMA_CU_422(16, 32)
CHROMA_CU_422(32, 64)
- p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
- p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
- p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
- p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
+ p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8_neon<8, 16>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16_neon<16, 32>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16_neon<32, 64>;
+
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = sa8d8_neon<8, 16>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = sa8d16_neon<16, 32>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = sa8d16_neon<32, 64>;
p.weight_pp = weight_pp_neon;
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 106ba903a..856c12862 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -56,261 +56,3 @@ function PFX(pixel_sub_ps_8x16_sve)
ret
endfunc
-//******* satd *******
-.macro satd_4x4_sve
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z2.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z1.h}, p0/z, [x0]
- ld1b {z3.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z4.h}, p0/z, [x0]
- ld1b {z6.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z5.h}, p0/z, [x0]
- ld1b {z7.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
-
- sub z0.h, z0.h, z2.h
- sub z1.h, z1.h, z3.h
- sub z2.h, z4.h, z6.h
- sub z3.h, z5.h, z7.h
-
- add z4.h, z0.h, z2.h
- add z5.h, z1.h, z3.h
- sub z6.h, z0.h, z2.h
- sub z7.h, z1.h, z3.h
-
- add z0.h, z4.h, z5.h
- sub z1.h, z4.h, z5.h
-
- add z2.h, z6.h, z7.h
- sub z3.h, z6.h, z7.h
-
- trn1 z4.h, z0.h, z2.h
- trn2 z5.h, z0.h, z2.h
-
- trn1 z6.h, z1.h, z3.h
- trn2 z7.h, z1.h, z3.h
-
- add z0.h, z4.h, z5.h
- sub z1.h, z4.h, z5.h
-
- add z2.h, z6.h, z7.h
- sub z3.h, z6.h, z7.h
-
- trn1 z4.s, z0.s, z1.s
- trn2 z5.s, z0.s, z1.s
-
- trn1 z6.s, z2.s, z3.s
- trn2 z7.s, z2.s, z3.s
-
- abs z4.h, p0/m, z4.h
- abs z5.h, p0/m, z5.h
- abs z6.h, p0/m, z6.h
- abs z7.h, p0/m, z7.h
-
- smax z4.h, p0/m, z4.h, z5.h
- smax z6.h, p0/m, z6.h, z7.h
-
- add z0.h, z4.h, z6.h
-
- uaddlp v0.2s, v0.4h
- uaddlp v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_sve)
- ptrue p0.h, vl4
- satd_4x4_sve
- fmov x0, d0
- ret
-endfunc
-
-function PFX(pixel_satd_8x4_sve)
- ptrue p0.h, vl4
- mov x4, x0
- mov x5, x2
- satd_4x4_sve
- add x0, x4, #4
- add x2, x5, #4
- umov x6, v0.d[0]
- satd_4x4_sve
- umov x0, v0.d[0]
- add x0, x0, x6
- ret
-endfunc
-
-function PFX(pixel_satd_8x12_sve)
- ptrue p0.h, vl4
- mov x4, x0
- mov x5, x2
- mov x7, #0
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
-.rept 2
- sub x0, x0, #4
- sub x2, x2, #4
- mov x4, x0
- mov x5, x2
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
-.endr
- mov x0, x7
- ret
-endfunc
-
-.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
- mov x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z1.h}, p0/z, [x0, x11]
- ld1b {z2.h}, p0/z, [x2]
- ld1b {z3.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z4.h}, p0/z, [x0]
- ld1b {z5.h}, p0/z, [x0, x11]
- ld1b {z6.h}, p0/z, [x2]
- ld1b {z7.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- sub \v0\().h, z0.h, z2.h
- sub \v4\().h, z1.h, z3.h
- sub \v1\().h, z4.h, z6.h
- sub \v5\().h, z5.h, z7.h
-
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z1.h}, p0/z, [x0, x11]
- ld1b {z2.h}, p0/z, [x2]
- ld1b {z3.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z4.h}, p0/z, [x0]
- ld1b {z5.h}, p0/z, [x0, x11]
- ld1b {z6.h}, p0/z, [x2]
- ld1b {z7.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- sub \v2\().h, z0.h, z2.h
- sub \v6\().h, z1.h, z3.h
- sub \v3\().h, z4.h, z6.h
- sub \v7\().h, z5.h, z7.h
-.endm
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_sve), export=0
- HADAMARD4_V z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
- HADAMARD4_V z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
- trn4 z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
- trn4 z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
- SUMSUB_ABCD z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
- SUMSUB_ABCD z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
- trn4 z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
- trn4 z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
- ABS8_SVE z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
- smax z0.h, p0/m, z0.h, z2.h
- smax z1.h, p0/m, z1.h, z3.h
- smax z4.h, p0/m, z4.h, z6.h
- smax z5.h, p0/m, z5.h, z7.h
- ret
-endfunc
-
-function PFX(satd_16x4_sve), export=0
- LOAD_DIFF_16x4_sve z16, z17, z18, z19, z20, z21, z22, z23
- b PFX(satd_8x4v_8x8h_sve)
-endfunc
-
-.macro pixel_satd_32x8_sve
- mov x4, x0
- mov x5, x2
-.rept 2
- bl PFX(satd_16x4_sve)
- add z30.h, z30.h, z0.h
- add z31.h, z31.h, z1.h
- add z30.h, z30.h, z4.h
- add z31.h, z31.h, z5.h
-.endr
- add x0, x4, #16
- add x2, x5, #16
-.rept 2
- bl PFX(satd_16x4_sve)
- add z30.h, z30.h, z0.h
- add z31.h, z31.h, z1.h
- add z30.h, z30.h, z4.h
- add z31.h, z31.h, z5.h
-.endr
-.endm
-
-.macro satd_32x16_sve
- movi v30.2d, #0
- movi v31.2d, #0
- pixel_satd_32x8_sve
- sub x0, x0, #16
- sub x2, x2, #16
- pixel_satd_32x8_sve
- add z0.h, z30.h, z31.h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
-.endm
-
-function PFX(pixel_satd_32x16_sve)
- ptrue p0.h, vl8
- mov x10, x30
- satd_32x16_sve
- mov x0, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x32_sve)
- ptrue p0.h, vl8
- mov x10, x30
- mov x7, #0
- satd_32x16_sve
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
- satd_32x16_sve
- add x0, x7, x6
- ret x10
-endfunc
-
-.macro satd_64x16_sve
- mov x8, x0
- mov x9, x2
- satd_32x16_sve
- add x7, x7, x6
- add x0, x8, #32
- add x2, x9, #32
- satd_32x16_sve
- add x7, x7, x6
-.endm
-
-function PFX(pixel_satd_64x48_sve)
- ptrue p0.h, vl8
- mov x10, x30
- mov x7, #0
-.rept 2
- satd_64x16_sve
- sub x0, x0, #48
- sub x2, x2, #48
-.endr
- satd_64x16_sve
- mov x0, x7
- ret x10
-endfunc
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 26fdbac6c..e189fdcd7 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -565,963 +565,6 @@ function PFX(scale2D_64to32_neon)
ret
endfunc
-//******* satd *******
-.macro satd_4x4_neon
- ldr s0, [x0]
- ldr s1, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[1], [x2], x3
-
- ldr s2, [x0]
- ldr s3, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1 {v2.s}[1], [x0], x1
- ld1 {v3.s}[1], [x2], x3
-
- usubl v4.8h, v0.8b, v1.8b
- usubl v5.8h, v2.8b, v3.8b
-
- add v6.8h, v4.8h, v5.8h
- sub v7.8h, v4.8h, v5.8h
-
- mov v4.d[0], v6.d[1]
- add v0.4h, v6.4h, v4.4h
- sub v2.4h, v6.4h, v4.4h
-
- mov v5.d[0], v7.d[1]
- add v1.4h, v7.4h, v5.4h
- sub v3.4h, v7.4h, v5.4h
-
- trn1 v4.4h, v0.4h, v1.4h
- trn2 v5.4h, v0.4h, v1.4h
-
- trn1 v6.4h, v2.4h, v3.4h
- trn2 v7.4h, v2.4h, v3.4h
-
- add v0.4h, v4.4h, v5.4h
- sub v1.4h, v4.4h, v5.4h
-
- add v2.4h, v6.4h, v7.4h
- sub v3.4h, v6.4h, v7.4h
-
- trn1 v4.2s, v0.2s, v1.2s
- trn2 v5.2s, v0.2s, v1.2s
-
- trn1 v6.2s, v2.2s, v3.2s
- trn2 v7.2s, v2.2s, v3.2s
-
- abs v4.4h, v4.4h
- abs v5.4h, v5.4h
- abs v6.4h, v6.4h
- abs v7.4h, v7.4h
-
- smax v1.4h, v4.4h, v5.4h
- smax v2.4h, v6.4h, v7.4h
-
- add v0.4h, v1.4h, v2.4h
- uaddlp v0.2s, v0.4h
- uaddlp v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_neon)
- satd_4x4_neon
- fmov x0, d0
- ret
-endfunc
-
-.macro x265_satd_4x8_8x4_end_neon
- add v0.8h, v4.8h, v6.8h
- add v1.8h, v5.8h, v7.8h
- sub v2.8h, v4.8h, v6.8h
- sub v3.8h, v5.8h, v7.8h
-
- trn1 v16.8h, v0.8h, v1.8h
- trn2 v17.8h, v0.8h, v1.8h
- add v4.8h, v16.8h, v17.8h
- trn1 v18.8h, v2.8h, v3.8h
- trn2 v19.8h, v2.8h, v3.8h
- sub v5.8h, v16.8h, v17.8h
- add v6.8h, v18.8h, v19.8h
- sub v7.8h, v18.8h, v19.8h
- trn1 v0.4s, v4.4s, v6.4s
- trn2 v2.4s, v4.4s, v6.4s
- abs v0.8h, v0.8h
- trn1 v1.4s, v5.4s, v7.4s
- trn2 v3.4s, v5.4s, v7.4s
- abs v2.8h, v2.8h
- abs v1.8h, v1.8h
- abs v3.8h, v3.8h
- umax v0.8h, v0.8h, v2.8h
- umax v1.8h, v1.8h, v3.8h
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
-.endm
-
-.macro pixel_satd_4x8_neon
- ld1r {v1.2s}, [x2], x3
- ld1r {v0.2s}, [x0], x1
- ld1r {v3.2s}, [x2], x3
- ld1r {v2.2s}, [x0], x1
- ld1r {v5.2s}, [x2], x3
- ld1r {v4.2s}, [x0], x1
- ld1r {v7.2s}, [x2], x3
- ld1r {v6.2s}, [x0], x1
-
- ld1 {v1.s}[1], [x2], x3
- ld1 {v0.s}[1], [x0], x1
- usubl v0.8h, v0.8b, v1.8b
- ld1 {v3.s}[1], [x2], x3
- ld1 {v2.s}[1], [x0], x1
- usubl v1.8h, v2.8b, v3.8b
- ld1 {v5.s}[1], [x2], x3
- ld1 {v4.s}[1], [x0], x1
- usubl v2.8h, v4.8b, v5.8b
- ld1 {v7.s}[1], [x2], x3
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
- ld1 {v6.s}[1], [x0], x1
- usubl v3.8h, v6.8b, v7.8b
- add v6.8h, v2.8h, v3.8h
- sub v7.8h, v2.8h, v3.8h
- x265_satd_4x8_8x4_end_neon
-.endm
-
-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x8_neon)
- pixel_satd_4x8_neon
- mov w0, v0.s[0]
- ret
-endfunc
-
-function PFX(pixel_satd_4x16_neon)
- mov w4, #0
- pixel_satd_4x8_neon
- mov w5, v0.s[0]
- add w4, w4, w5
- pixel_satd_4x8_neon
- mov w5, v0.s[0]
- add w0, w5, w4
- ret
-endfunc
-
-function PFX(pixel_satd_4x32_neon)
- mov w4, #0
-.rept 4
- pixel_satd_4x8_neon
- mov w5, v0.s[0]
- add w4, w4, w5
-.endr
- mov w0, w4
- ret
-endfunc
-
-function PFX(pixel_satd_12x16_neon)
- mov x4, x0
- mov x5, x2
- mov w7, #0
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-
- add x0, x4, #4
- add x2, x5, #4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-
- add x0, x4, #8
- add x2, x5, #8
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w0, w7, w6
- ret
-endfunc
-
-function PFX(pixel_satd_12x32_neon)
- mov x4, x0
- mov x5, x2
- mov w7, #0
-.rept 4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-.endr
-
- add x0, x4, #4
- add x2, x5, #4
-.rept 4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-.endr
-
- add x0, x4, #8
- add x2, x5, #8
-.rept 4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-.endr
-
- mov w0, w7
- ret
-endfunc
-
-function PFX(pixel_satd_8x4_neon)
- mov x4, x0
- mov x5, x2
- satd_4x4_neon
- add x0, x4, #4
- add x2, x5, #4
- umov x6, v0.d[0]
- satd_4x4_neon
- umov x0, v0.d[0]
- add x0, x0, x6
- ret
-endfunc
-
-.macro LOAD_DIFF_8x4 v0 v1 v2 v3
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x2], x3
- ld1 {v2.8b}, [x0], x1
- ld1 {v3.8b}, [x2], x3
- ld1 {v4.8b}, [x0], x1
- ld1 {v5.8b}, [x2], x3
- ld1 {v6.8b}, [x0], x1
- ld1 {v7.8b}, [x2], x3
- usubl \v0, v0.8b, v1.8b
- usubl \v1, v2.8b, v3.8b
- usubl \v2, v4.8b, v5.8b
- usubl \v3, v6.8b, v7.8b
-.endm
-
-.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
- ld1 {v0.16b}, [x0], x1
- ld1 {v1.16b}, [x2], x3
- ld1 {v2.16b}, [x0], x1
- ld1 {v3.16b}, [x2], x3
- ld1 {v4.16b}, [x0], x1
- ld1 {v5.16b}, [x2], x3
- ld1 {v6.16b}, [x0], x1
- ld1 {v7.16b}, [x2], x3
- usubl \v0, v0.8b, v1.8b
- usubl \v1, v2.8b, v3.8b
- usubl \v2, v4.8b, v5.8b
- usubl \v3, v6.8b, v7.8b
- usubl2 \v4, v0.16b, v1.16b
- usubl2 \v5, v2.16b, v3.16b
- usubl2 \v6, v4.16b, v5.16b
- usubl2 \v7, v6.16b, v7.16b
-.endm
-
-function PFX(satd_16x4_neon), export=0
- LOAD_DIFF_16x4 v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
- b PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-function PFX(satd_8x8_neon), export=0
- LOAD_DIFF_8x4 v16.8h, v17.8h, v18.8h, v19.8h
- LOAD_DIFF_8x4 v20.8h, v21.8h, v22.8h, v23.8h
- b PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_neon), export=0
- HADAMARD4_V v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
- HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
- trn4 v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
- trn4 v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
- SUMSUB_ABCD v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
- SUMSUB_ABCD v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
- trn4 v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
- trn4 v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
- ABS8 v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
- smax v0.8h, v0.8h, v2.8h
- smax v1.8h, v1.8h, v3.8h
- smax v2.8h, v4.8h, v6.8h
- smax v3.8h, v5.8h, v7.8h
- ret
-endfunc
-
-function PFX(pixel_satd_8x8_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- add v1.8h, v2.8h, v3.8h
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_8x12_neon)
- mov x4, x0
- mov x5, x2
- mov x7, #0
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
-.rept 2
- sub x0, x0, #4
- sub x2, x2, #4
- mov x4, x0
- mov x5, x2
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
-.endr
- mov x0, x7
- ret
-endfunc
-
-function PFX(pixel_satd_8x16_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_8x32_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 3
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_8x64_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 7
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x4_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x8_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x12_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 2
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x16_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 3
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x24_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 5
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-.macro pixel_satd_16x32_neon
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 7
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-function PFX(pixel_satd_16x32_neon)
- mov x10, x30
- pixel_satd_16x32_neon
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x64_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 15
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_24x32_neon)
- mov x10, x30
- mov x7, #0
- mov x4, x0
- mov x5, x2
-.rept 3
- movi v30.8h, #0
- movi v31.8h, #0
-.rept 4
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
- add x4, x4, #8
- add x5, x5, #8
- mov x0, x4
- mov x2, x5
-.endr
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_24x64_neon)
- mov x10, x30
- mov x7, #0
- mov x4, x0
- mov x5, x2
-.rept 3
- movi v30.8h, #0
- movi v31.8h, #0
-.rept 4
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
- add x4, x4, #8
- add x5, x5, #8
- mov x0, x4
- mov x2, x5
-.endr
- sub x4, x4, #24
- sub x5, x5, #24
- add x0, x4, x1, lsl #5
- add x2, x5, x3, lsl #5
- mov x4, x0
- mov x5, x2
-.rept 3
- movi v30.8h, #0
- movi v31.8h, #0
-.rept 4
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
- add x4, x4, #8
- add x5, x5, #8
- mov x0, x4
- mov x2, x5
-.endr
- mov x0, x7
- ret x10
-endfunc
-
-.macro pixel_satd_32x8
- mov x4, x0
- mov x5, x2
-.rept 2
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add x0, x4, #16
- add x2, x5, #16
-.rept 2
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-.macro satd_32x16_neon
- movi v30.8h, #0
- movi v31.8h, #0
- pixel_satd_32x8
- sub x0, x0, #16
- sub x2, x2, #16
- pixel_satd_32x8
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
-.endm
-
-.macro satd_64x16_neon
- mov x8, x0
- mov x9, x2
- satd_32x16_neon
- add x7, x7, x6
- add x0, x8, #32
- add x2, x9, #32
- satd_32x16_neon
- add x7, x7, x6
-.endm
-
-function PFX(pixel_satd_32x8_neon)
- mov x10, x30
- mov x7, #0
- mov x4, x0
- mov x5, x2
- movi v30.8h, #0
- movi v31.8h, #0
- pixel_satd_32x8
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x16_neon)
- mov x10, x30
- satd_32x16_neon
- mov x0, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x24_neon)
- mov x10, x30
- satd_32x16_neon
- movi v30.8h, #0
- movi v31.8h, #0
- sub x0, x0, #16
- sub x2, x2, #16
- pixel_satd_32x8
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- add x0, x0, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x32_neon)
- mov x10, x30
- mov x7, #0
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
- satd_32x16_neon
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x48_neon)
- mov x10, x30
- mov x7, #0
-.rept 2
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
-.endr
- satd_32x16_neon
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x64_neon)
- mov x10, x30
- mov x7, #0
-.rept 3
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
-.endr
- satd_32x16_neon
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x16_neon)
- mov x10, x30
- mov x7, #0
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x32_neon)
- mov x10, x30
- mov x7, #0
- satd_64x16_neon
- sub x0, x0, #48
- sub x2, x2, #48
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x48_neon)
- mov x10, x30
- mov x7, #0
-.rept 2
- satd_64x16_neon
- sub x0, x0, #48
- sub x2, x2, #48
-.endr
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x64_neon)
- mov x10, x30
- mov x7, #0
-.rept 3
- satd_64x16_neon
- sub x0, x0, #48
- sub x2, x2, #48
-.endr
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_48x64_neon)
- mov x10, x30
- mov x7, #0
- mov x8, x0
- mov x9, x2
-.rept 3
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
-.endr
- satd_32x16_neon
- add x7, x7, x6
-
- add x0, x8, #32
- add x2, x9, #32
- pixel_satd_16x32_neon
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
-
- movi v30.8h, #0
- movi v31.8h, #0
- pixel_satd_16x32_neon
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(sa8d_8x8_neon), export=0
- LOAD_DIFF_8x4 v16.8h, v17.8h, v18.8h, v19.8h
- LOAD_DIFF_8x4 v20.8h, v21.8h, v22.8h, v23.8h
- HADAMARD4_V v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
- HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
- SUMSUB_ABCD v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
- SUMSUB_ABCD v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
- trn4 v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
- trn4 v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
- SUMSUB_ABCD v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
- SUMSUB_ABCD v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
- trn4 v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
- trn4 v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
- SUMSUB_ABCD v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
- SUMSUB_ABCD v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
- trn4 v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
- trn4 v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
- ABS8 v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
- smax v16.8h, v16.8h, v20.8h
- smax v17.8h, v17.8h, v21.8h
- smax v18.8h, v18.8h, v22.8h
- smax v19.8h, v19.8h, v23.8h
- add v0.8h, v16.8h, v17.8h
- add v1.8h, v18.8h, v19.8h
- ret
-endfunc
-
-function PFX(pixel_sa8d_8x8_neon)
- mov x10, x30
- bl PFX(sa8d_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- add w0, w0, #1
- lsr w0, w0, #1
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_8x16_neon)
- mov x10, x30
- bl PFX(sa8d_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w5, v0.s[0]
- add w5, w5, #1
- lsr w5, w5, #1
- bl PFX(sa8d_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w4, v0.s[0]
- add w4, w4, #1
- lsr w4, w4, #1
- add w0, w4, w5
- ret x10
-endfunc
-
-.macro sa8d_16x16 reg
- bl PFX(sa8d_8x8_neon)
- uaddlp v30.4s, v0.8h
- uaddlp v31.4s, v1.8h
- bl PFX(sa8d_8x8_neon)
- uadalp v30.4s, v0.8h
- uadalp v31.4s, v1.8h
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- bl PFX(sa8d_8x8_neon)
- uadalp v30.4s, v0.8h
- uadalp v31.4s, v1.8h
- bl PFX(sa8d_8x8_neon)
- uadalp v30.4s, v0.8h
- uadalp v31.4s, v1.8h
- add v0.4s, v30.4s, v31.4s
- addv s0, v0.4s
- mov \reg, v0.s[0]
- add \reg, \reg, #1
- lsr \reg, \reg, #1
-.endm
-
-function PFX(pixel_sa8d_16x16_neon)
- mov x10, x30
- sa8d_16x16 w0
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_16x32_neon)
- mov x10, x30
- sa8d_16x16 w4
- sub x0, x0, #8
- sub x2, x2, #8
- sa8d_16x16 w5
- add w0, w4, w5
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_32x32_neon)
- mov x10, x30
- sa8d_16x16 w4
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w5
- sub x0, x0, #24
- sub x2, x2, #24
- sa8d_16x16 w6
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w7
- add w4, w4, w5
- add w6, w6, w7
- add w0, w4, w6
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_32x64_neon)
- mov x10, x30
- mov w11, #4
- mov w9, #0
-.Loop_sa8d_32:
- sub w11, w11, #1
- sa8d_16x16 w4
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w5
- add w4, w4, w5
- add w9, w9, w4
- sub x0, x0, #24
- sub x2, x2, #24
- cbnz w11, .Loop_sa8d_32
- mov w0, w9
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_64x64_neon)
- mov x10, x30
- mov w11, #4
- mov w9, #0
-.Loop_sa8d_64:
- sub w11, w11, #1
- sa8d_16x16 w4
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w5
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w6
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w7
- add w4, w4, w5
- add w6, w6, w7
- add w8, w4, w6
- add w9, w9, w8
-
- sub x0, x0, #56
- sub x2, x2, #56
- cbnz w11, .Loop_sa8d_64
- mov w0, w9
- ret x10
-endfunc
-
/***** dequant_scaling*****/
// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
function PFX(dequant_scaling_neon)
--
2.39.5 (Apple Git-154)
Clean up and optimize the Neon intrinsics implementation of the
satd/sa8d primitives for all bitdepths.
Remove the Neon and SVE assembly implementations of these primitives
since they are now slower than the Neon intrinsics implementations.
---
source/common/aarch64/asm-primitives.cpp | 76 --
source/common/aarch64/mem-neon.h | 6 +-
source/common/aarch64/pixel-prim.cpp | 1407 +++++++++++-----------
source/common/aarch64/pixel-util-sve.S | 258 ----
source/common/aarch64/pixel-util.S | 957 ---------------
5 files changed, 712 insertions(+), 1992 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 6097f7655..4d2c575d1 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -652,64 +652,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
- // satd
- ALL_LUMA_PU(satd, pixel_satd, neon);
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_neon);
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_neon);
-
- // sa8d
- p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_neon);
- p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_neon);
- p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-
// dequant_scaling
p.dequant_scaling = PFX(dequant_scaling_neon);
@@ -857,24 +799,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
#if !HIGH_BIT_DEPTH
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
-
- // satd
- p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sve);
- p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sve);
- p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sve);
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_sve);
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_sve);
-
- // sa8d
- p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
#else // HIGH_BIT_DEPTH
// sse_pp
p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 263c1d569..8bd5fbee9 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -106,8 +106,7 @@ static void inline load_u8x16xn(const uint8_t *src, const intptr_t stride,
{
for (int i = 0; i < N; ++i)
{
- dst[i] = vld1q_u8(src);
- src += stride;
+ dst[i] = vld1q_u8(src + i * stride);
}
}
@@ -230,8 +229,7 @@ static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
{
for (int i = 0; i < N; ++i)
{
- dst[i] = vld1q_u16(src);
- src += stride;
+ dst[i] = vld1q_u16(src + i * stride);
}
}
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 15ccdff22..67c388b59 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -19,799 +19,805 @@ namespace
{
-/* SATD SA8D variants - based on x264 */
-static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a, const int16x8_t b)
+static inline void sumsubq_s16(int16x8_t *sum, int16x8_t *sub, const int16x8_t a, const int16x8_t b)
{
- sum = vaddq_s16(a, b);
- sub = vsubq_s16(a, b);
+ *sum = vaddq_s16(a, b);
+ *sub = vsubq_s16(a, b);
}
-static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
- const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s16x2(int16x8_t *t1, int16x8_t *t2,
+ const int16x8_t s1, const int16x8_t s2)
{
- t1 = vtrn1q_s16(s1, s2);
- t2 = vtrn2q_s16(s1, s2);
+ *t1 = vtrn1q_s16(s1, s2);
+ *t2 = vtrn2q_s16(s1, s2);
}
-static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
- const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s32x2(int16x8_t *t1, int16x8_t *t2,
+ const int16x8_t s1, const int16x8_t s2)
{
int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
- t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
- t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
+ *t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
+ *t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
}
-static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
- const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_s16_s64x2(int16x8_t *t1, int16x8_t *t2,
+ const int16x8_t s1, const int16x8_t s2)
{
int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
- t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
- t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
+ *t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
+ *t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
}
-static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
- int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d)
+static inline uint16x8_t max_abs_s16(const int16x8_t a, const int16x8_t b)
{
- SUMSUB_AB(s1, d1, a, b);
- SUMSUB_AB(s2, d2, c, d);
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(a));
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(b));
+
+ return vmaxq_u16(abs0, abs1);
}
-static inline void HADAMARD4_V(int16x8_t &r1, int16x8_t &r2, int16x8_t &r3, int16x8_t &r4,
- int16x8_t &t1, int16x8_t &t2, int16x8_t &t3, int16x8_t &t4)
+#if X265_DEPTH == 12
+static inline void sumsubq_s32(int32x4_t *sum, int32x4_t *sub, const int32x4_t a, const int32x4_t b)
{
- SUMSUB_ABCD(t1, t2, t3, t4, r1, r2, r3, r4);
- SUMSUB_ABCD(r1, r3, r2, r4, t1, t3, t2, t4);
+ *sum = vaddq_s32(a, b);
+ *sub = vsubq_s32(a, b);
}
-
-static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int16x8_t v3)
-
+static inline void sumsublq_s16(int32x4_t *sum_lo, int32x4_t *sum_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi,
+ const int16x8_t a, const int16x8_t b)
{
+ *sum_lo = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+ *sub_lo = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+ *sum_hi = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+ *sub_hi = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+}
- int16x8_t v4, v5, v6, v7, v16, v17, v18, v19;
-
-
- SUMSUB_AB(v16, v17, v0, v1);
- SUMSUB_AB(v18, v19, v2, v3);
-
- SUMSUB_AB(v4 , v6 , v16, v18);
- SUMSUB_AB(v5 , v7 , v17, v19);
-
- transpose_8h_8h(v0, v1, v4, v5);
- transpose_8h_8h(v2, v3, v6, v7);
+static inline void transpose_inplace_s32_s64x2(int32x4_t *t1, int32x4_t *t2)
+{
+ int64x2_t tmp1 = vreinterpretq_s64_s32(*t1);
+ int64x2_t tmp2 = vreinterpretq_s64_s32(*t2);
- SUMSUB_AB(v16, v17, v0, v1);
- SUMSUB_AB(v18, v19, v2, v3);
+ *t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
+ *t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+}
- transpose_4s_8h(v0, v1, v16, v18);
- transpose_4s_8h(v2, v3, v17, v19);
+static inline uint32x4_t max_abs_s32(int32x4_t a, int32x4_t b)
+{
+ uint32x4_t abs0 = vreinterpretq_u32_s32(vabsq_s32(a));
+ uint32x4_t abs1 = vreinterpretq_u32_s32(vabsq_s32(b));
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
- uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
- uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+ return vmaxq_u32(abs0, abs1);
+}
- uint16x8_t max0 = vmaxq_u16(abs0, abs1);
- uint16x8_t max1 = vmaxq_u16(abs2, abs3);
+#endif // X265_DEPTH == 12
- uint16x8_t sum = vaddq_u16(max0, max1);
- return vaddlvq_u16(sum);
+#if HIGH_BIT_DEPTH
+static inline void load_diff_u16x8x4(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+ uint16x8_t r[4], t[4];
+ load_u16x8xn<4>(pix1, stride_pix1, r);
+ load_u16x8xn<4>(pix2, stride_pix2, t);
+
+ diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+ diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+ diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+ diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
}
-static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
+static inline void load_diff_u16x8x4_dual(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
{
- int16x8_t v2, v3;
- SUMSUB_AB(v2, v3, v0, v1);
-
- transpose_2d_8h(v0, v1, v2, v3);
- SUMSUB_AB(v2, v3, v0, v1);
-
- transpose_8h_8h(v0, v1, v2, v3);
- SUMSUB_AB(v2, v3, v0, v1);
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+ pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
+}
- transpose_4s_8h(v0, v1, v2, v3);
+static inline void load_diff_u16x8x8(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+ uint16x8_t r[8], t[8];
+ load_u16x8xn<8>(pix1, stride_pix1, r);
+ load_u16x8xn<8>(pix2, stride_pix2, t);
+
+ diff[0] = vreinterpretq_s16_u16(vsubq_u16(r[0], t[0]));
+ diff[1] = vreinterpretq_s16_u16(vsubq_u16(r[1], t[1]));
+ diff[2] = vreinterpretq_s16_u16(vsubq_u16(r[2], t[2]));
+ diff[3] = vreinterpretq_s16_u16(vsubq_u16(r[3], t[3]));
+ diff[4] = vreinterpretq_s16_u16(vsubq_u16(r[4], t[4]));
+ diff[5] = vreinterpretq_s16_u16(vsubq_u16(r[5], t[5]));
+ diff[6] = vreinterpretq_s16_u16(vsubq_u16(r[6], t[6]));
+ diff[7] = vreinterpretq_s16_u16(vsubq_u16(r[7], t[7]));
+}
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
- uint16x8_t max = vmaxq_u16(abs0, abs1);
+#else // !HIGH_BIT_DEPTH
+static inline void load_diff_u8x8x4(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[4])
+{
+ uint8x8_t r[4], t[4];
+ load_u8x8xn<4>(pix1, stride_pix1, r);
+ load_u8x8xn<4>(pix2, stride_pix2, t);
+
+ diff[0] = vreinterpretq_s16_u16(vsubl_u8(r[0], t[0]));
+ diff[1] = vreinterpretq_s16_u16(vsubl_u8(r[1], t[1]));
+ diff[2] = vreinterpretq_s16_u16(vsubl_u8(r[2], t[2]));
+ diff[3] = vreinterpretq_s16_u16(vsubl_u8(r[3], t[3]));
+}
- return vaddlvq_u16(max);
+static inline void load_diff_u8x8x8(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
+{
+ load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x4(pix1 + 4 * stride_pix1, stride_pix1,
+ pix2 + 4 * stride_pix2, stride_pix2, diff + 4);
}
-static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
- int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline void load_diff_u8x16x4(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2, int16x8_t diff[8])
{
- int16x8_t v16, v17, v18, v19, v4, v5, v6, v7;
+ uint8x16_t s1[4], s2[4];
+ load_u8x16xn<4>(pix1, stride_pix1, s1);
+ load_u8x16xn<4>(pix2, stride_pix2, s2);
+
+ diff[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[0]), vget_low_u8(s2[0])));
+ diff[1] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[1]), vget_low_u8(s2[1])));
+ diff[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[2]), vget_low_u8(s2[2])));
+ diff[3] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1[3]), vget_low_u8(s2[3])));
+ diff[4] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[0]), vget_high_u8(s2[0])));
+ diff[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[1]), vget_high_u8(s2[1])));
+ diff[6] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[2]), vget_high_u8(s2[2])));
+ diff[7] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1[3]), vget_high_u8(s2[3])));
+}
- SUMSUB_AB(v16, v18, v0, v2);
- SUMSUB_AB(v17, v19, v1, v3);
+#endif // HIGH_BIT_DEPTH
- HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
+// 4 way hadamard vertical pass.
+static inline void hadamard_4_v(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
+{
+ int16x8_t s0, s1, d0, d1;
- transpose_8h_8h(v0, v1, v16, v17);
- transpose_8h_8h(v2, v3, v18, v19);
- transpose_8h_8h(v4, v5, v20, v21);
- transpose_8h_8h(v6, v7, v22, v23);
+ sumsubq_s16(&s0, &d0, in_coefs[0], in_coefs[1]);
+ sumsubq_s16(&s1, &d1, in_coefs[2], in_coefs[3]);
- SUMSUB_AB(v16, v17, v0, v1);
- SUMSUB_AB(v18, v19, v2, v3);
- SUMSUB_AB(v20, v21, v4, v5);
- SUMSUB_AB(v22, v23, v6, v7);
+ sumsubq_s16(&out_coefs[0], &out_coefs[2], s0, s1);
+ sumsubq_s16(&out_coefs[1], &out_coefs[3], d0, d1);
+}
- transpose_4s_8h(v0, v2, v16, v18);
- transpose_4s_8h(v1, v3, v17, v19);
- transpose_4s_8h(v4, v6, v20, v22);
- transpose_4s_8h(v5, v7, v21, v23);
+// 8 way hadamard vertical pass.
+static inline void hadamard_8_v(const int16x8_t in_coefs[8], int16x8_t out_coefs[8])
+{
+ int16x8_t temp[8];
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
- uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
- uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
- uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
- uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
- uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
- uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+ hadamard_4_v(in_coefs, temp);
+ hadamard_4_v(in_coefs + 4, temp + 4);
- v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
- v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
- v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
- v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
+ sumsubq_s16(&out_coefs[0], &out_coefs[4], temp[0], temp[4]);
+ sumsubq_s16(&out_coefs[1], &out_coefs[5], temp[1], temp[5]);
+ sumsubq_s16(&out_coefs[2], &out_coefs[6], temp[2], temp[6]);
+ sumsubq_s16(&out_coefs[3], &out_coefs[7], temp[3], temp[7]);
}
-#if HIGH_BIT_DEPTH
-
-#if (X265_DEPTH > 10)
-static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
- const int32x4_t s1, const int32x4_t s2)
+// 4-way Hadamard horizontal pass.
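+// The transposes gather horizontally adjacent coefficients into matching
+// lanes so that the same vector sum/difference butterflies apply the
+// transform along each row.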
+static inline void hadamard_4_h(const int16x8_t in_coefs[4], int16x8_t out_coefs[4])
{
- int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
- int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
+ int16x8_t s0, s1, d0, d1, t0, t1, t2, t3;
+
+ transpose_s16_s16x2(&t0, &t1, in_coefs[0], in_coefs[1]);
+ transpose_s16_s16x2(&t2, &t3, in_coefs[2], in_coefs[3]);
+
+ sumsubq_s16(&s0, &d0, t0, t1);
+ sumsubq_s16(&s1, &d1, t2, t3);
- t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
- t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
+ transpose_s16_s32x2(&out_coefs[0], &out_coefs[1], s0, s1);
+ transpose_s16_s32x2(&out_coefs[2], &out_coefs[3], d0, d1);
}
-static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
+#if X265_DEPTH != 12
+// 8-way Hadamard horizontal pass.
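+// The last butterfly stage is folded into max_abs_s16() via the identity
+// |a + b| + |a - b| == 2 * max(|a|, |b|), which also provides the halving
+// used in the SATD/SA8D definitions.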
+static inline void hadamard_8_h(int16x8_t coefs[8], uint16x8_t out[4])
{
- sum = vaddq_s32(a, b);
- sub = vsubq_s32(a, b);
+ int16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ int16x8_t temp[8];
+
+ hadamard_4_h(coefs, temp);
+ hadamard_4_h(coefs + 4, temp + 4);
+
+ sumsubq_s16(&s0, &d0, temp[0], temp[1]);
+ sumsubq_s16(&s1, &d1, temp[2], temp[3]);
+ sumsubq_s16(&s2, &d2, temp[4], temp[5]);
+ sumsubq_s16(&s3, &d3, temp[6], temp[7]);
+
+ transpose_s16_s64x2(&temp[0], &temp[1], s0, s2);
+ transpose_s16_s64x2(&temp[2], &temp[3], s1, s3);
+ transpose_s16_s64x2(&temp[4], &temp[5], d0, d2);
+ transpose_s16_s64x2(&temp[6], &temp[7], d1, d3);
+
+ out[0] = max_abs_s16(temp[0], temp[1]);
+ out[1] = max_abs_s16(temp[2], temp[3]);
+ out[2] = max_abs_s16(temp[4], temp[5]);
+ out[3] = max_abs_s16(temp[6], temp[7]);
}
-static inline void ISUMSUB_AB_FROM_INT16(int32x4_t &suml, int32x4_t &sumh, int32x4_t &subl, int32x4_t &subh,
- const int16x8_t a, const int16x8_t b)
+#else // X265_DEPTH == 12
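+// At 12-bit the horizontal pass can overflow int16, so the sums and
+// differences are widened to int32 (low/high halves) before the final
+// butterflies and the max of absolute values.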
+static inline void hadamard_8_h(int16x8_t coefs[8], uint32x4_t out[4])
{
- suml = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
- sumh = vaddl_high_s16(a, b);
- subl = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
- subh = vsubl_high_s16(a, b);
+ int16x8_t a[8];
+
+ transpose_s16_s16x2(&a[0], &a[1], coefs[0], coefs[1]);
+ transpose_s16_s16x2(&a[2], &a[3], coefs[2], coefs[3]);
+ transpose_s16_s16x2(&a[4], &a[5], coefs[4], coefs[5]);
+ transpose_s16_s16x2(&a[6], &a[7], coefs[6], coefs[7]);
+
+ int32x4_t a_lo[8], a_hi[8], b_lo[8], b_hi[8];
+
+ sumsublq_s16(&a_lo[0], &a_hi[0], &a_lo[4], &a_hi[4], a[0], a[1]);
+ sumsublq_s16(&a_lo[1], &a_hi[1], &a_lo[5], &a_hi[5], a[2], a[3]);
+ sumsublq_s16(&a_lo[2], &a_hi[2], &a_lo[6], &a_hi[6], a[4], a[5]);
+ sumsublq_s16(&a_lo[3], &a_hi[3], &a_lo[7], &a_hi[7], a[6], a[7]);
+
+ transpose_inplace_s32_s64x2(&a_lo[0], &a_lo[1]);
+ transpose_inplace_s32_s64x2(&a_lo[2], &a_lo[3]);
+ transpose_inplace_s32_s64x2(&a_lo[4], &a_lo[5]);
+ transpose_inplace_s32_s64x2(&a_lo[6], &a_lo[7]);
+
+ transpose_inplace_s32_s64x2(&a_hi[0], &a_hi[1]);
+ transpose_inplace_s32_s64x2(&a_hi[2], &a_hi[3]);
+ transpose_inplace_s32_s64x2(&a_hi[4], &a_hi[5]);
+ transpose_inplace_s32_s64x2(&a_hi[6], &a_hi[7]);
+
+ sumsubq_s32(&b_lo[0], &b_lo[1], a_lo[0], a_lo[1]);
+ sumsubq_s32(&b_lo[2], &b_lo[3], a_lo[2], a_lo[3]);
+ sumsubq_s32(&b_lo[4], &b_lo[5], a_lo[4], a_lo[5]);
+ sumsubq_s32(&b_lo[6], &b_lo[7], a_lo[6], a_lo[7]);
+
+ sumsubq_s32(&b_hi[0], &b_hi[1], a_hi[0], a_hi[1]);
+ sumsubq_s32(&b_hi[2], &b_hi[3], a_hi[2], a_hi[3]);
+ sumsubq_s32(&b_hi[4], &b_hi[5], a_hi[4], a_hi[5]);
+ sumsubq_s32(&b_hi[6], &b_hi[7], a_hi[6], a_hi[7]);
+
+ uint32x4_t max0_lo = max_abs_s32(b_lo[0], b_hi[0]);
+ uint32x4_t max1_lo = max_abs_s32(b_lo[1], b_hi[1]);
+ uint32x4_t max2_lo = max_abs_s32(b_lo[2], b_hi[2]);
+ uint32x4_t max3_lo = max_abs_s32(b_lo[3], b_hi[3]);
+ uint32x4_t max0_hi = max_abs_s32(b_lo[4], b_hi[4]);
+ uint32x4_t max1_hi = max_abs_s32(b_lo[5], b_hi[5]);
+ uint32x4_t max2_hi = max_abs_s32(b_lo[6], b_hi[6]);
+ uint32x4_t max3_hi = max_abs_s32(b_lo[7], b_hi[7]);
+
+ out[0] = vaddq_u32(max0_lo, max0_hi);
+ out[1] = vaddq_u32(max1_lo, max1_hi);
+ out[2] = vaddq_u32(max2_lo, max2_hi);
+ out[3] = vaddq_u32(max3_lo, max3_hi);
}
-#endif
+#endif // X265_DEPTH != 12
-static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
- int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
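+// SATD of a single 4x4 difference block: a0 holds rows 0 and 2, a1 holds
+// rows 1 and 3 of the residual.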
+static inline int hadamard_4x4(int16x8_t a0, int16x8_t a1)
{
- uint16x8_t r0, r1, r2, r3;
- uint16x8_t t0, t1, t2, t3;
- int16x8_t v16, v17;
- int16x8_t v18, v19;
-
- r0 = vld1q_u16(pix1 + 0 * stride_pix1);
- r1 = vld1q_u16(pix1 + 1 * stride_pix1);
- r2 = vld1q_u16(pix1 + 2 * stride_pix1);
- r3 = vld1q_u16(pix1 + 3 * stride_pix1);
-
- t0 = vld1q_u16(pix2 + 0 * stride_pix2);
- t1 = vld1q_u16(pix2 + 1 * stride_pix2);
- t2 = vld1q_u16(pix2 + 2 * stride_pix2);
- t3 = vld1q_u16(pix2 + 3 * stride_pix2);
-
- v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+ int16x8_t sum, dif, t0, t1;
+ sumsubq_s16(&sum, &dif, a0, a1);
- r0 = vld1q_u16(pix1 + 4 * stride_pix1);
- r1 = vld1q_u16(pix1 + 5 * stride_pix1);
- r2 = vld1q_u16(pix1 + 6 * stride_pix1);
- r3 = vld1q_u16(pix1 + 7 * stride_pix1);
+ transpose_s16_s64x2(&t0, &t1, sum, dif);
+ sumsubq_s16(&sum, &dif, t0, t1);
- t0 = vld1q_u16(pix2 + 4 * stride_pix2);
- t1 = vld1q_u16(pix2 + 5 * stride_pix2);
- t2 = vld1q_u16(pix2 + 6 * stride_pix2);
- t3 = vld1q_u16(pix2 + 7 * stride_pix2);
+ transpose_s16_s16x2(&t0, &t1, sum, dif);
+ sumsubq_s16(&sum, &dif, t0, t1);
- v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+ transpose_s16_s32x2(&t0, &t1, sum, dif);
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ uint16x8_t max = max_abs_s16(t0, t1);
+ return vaddlvq_u16(max);
}
-
-
-
-static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+// Calculate two 4x4 Hadamard transforms.
+static void hadamard_4x4_dual(int16x8_t diff[4], uint16x8_t *out)
{
- uint16x8_t r0, r1, r2, r3;
- uint16x8_t t0, t1, t2, t3;
- int16x8_t v16, v17, v20, v21;
- int16x8_t v18, v19, v22, v23;
+ int16x8_t temp[4];
- r0 = vld1q_u16(pix1 + 0 * stride_pix1);
- r1 = vld1q_u16(pix1 + 1 * stride_pix1);
- r2 = vld1q_u16(pix1 + 2 * stride_pix1);
- r3 = vld1q_u16(pix1 + 3 * stride_pix1);
+ hadamard_4_v(diff, temp);
+ hadamard_4_h(temp, diff);
- t0 = vld1q_u16(pix2 + 0 * stride_pix2);
- t1 = vld1q_u16(pix2 + 1 * stride_pix2);
- t2 = vld1q_u16(pix2 + 2 * stride_pix2);
- t3 = vld1q_u16(pix2 + 3 * stride_pix2);
+ uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+ uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
- v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
-
- r0 = vld1q_u16(pix1 + 0 * stride_pix1 + 8);
- r1 = vld1q_u16(pix1 + 1 * stride_pix1 + 8);
- r2 = vld1q_u16(pix1 + 2 * stride_pix1 + 8);
- r3 = vld1q_u16(pix1 + 3 * stride_pix1 + 8);
+ *out = vaddq_u16(sum0, sum1);
+}
- t0 = vld1q_u16(pix2 + 0 * stride_pix2 + 8);
- t1 = vld1q_u16(pix2 + 1 * stride_pix2 + 8);
- t2 = vld1q_u16(pix2 + 2 * stride_pix2 + 8);
- t3 = vld1q_u16(pix2 + 3 * stride_pix2 + 8);
+// Calculate four 4x4 Hadamard transforms.
+static inline void hadamard_4x4_quad(int16x8_t diff[8], uint16x8_t out[2])
+{
+ int16x8_t temp[8];
- v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
- v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
- v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
- v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
+ hadamard_4_v(diff, temp);
+ hadamard_4_v(diff + 4, temp + 4);
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ hadamard_4_h(temp, diff);
+ hadamard_4_h(temp + 4, diff + 4);
- _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+ uint16x8_t sum0 = max_abs_s16(diff[0], diff[1]);
+ uint16x8_t sum1 = max_abs_s16(diff[2], diff[3]);
+ uint16x8_t sum2 = max_abs_s16(diff[4], diff[5]);
+ uint16x8_t sum3 = max_abs_s16(diff[6], diff[7]);
+ out[0] = vaddq_u16(sum0, sum1);
+ out[1] = vaddq_u16(sum2, sum3);
}
-
-int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
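+// Full 8x8 Hadamard transform. The output accumulator width depends on the
+// bit depth: 8-bit sums fit in uint16 lanes, 10-bit results are widened with
+// pairwise adds, and 12-bit uses the int32 horizontal pass above.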
+#if X265_DEPTH == 8
+static inline void hadamard_8x8(int16x8_t diff[8], uint16x8_t out[2])
{
- uint16x4_t t0_0 = vld1_u16(pix1 + 0 * stride_pix1);
- uint16x4_t t1_0 = vld1_u16(pix1 + 1 * stride_pix1);
- uint16x4_t t0_1 = vld1_u16(pix1 + 2 * stride_pix1);
- uint16x4_t t1_1 = vld1_u16(pix1 + 3 * stride_pix1);
- uint16x8_t t0 = vcombine_u16(t0_0, t0_1);
- uint16x8_t t1 = vcombine_u16(t1_0, t1_1);
+ int16x8_t temp[8];
+ uint16x8_t sum[4];
- uint16x4_t r0_0 = vld1_u16(pix2 + 0 * stride_pix2);
- uint16x4_t r1_0 = vld1_u16(pix2 + 1 * stride_pix2);
- uint16x4_t r0_1 = vld1_u16(pix2 + 2 * stride_pix2);
- uint16x4_t r1_1 = vld1_u16(pix2 + 3 * stride_pix2);
- uint16x8_t r0 = vcombine_u16(r0_0, r0_1);
- uint16x8_t r1 = vcombine_u16(r1_0, r1_1);
+ hadamard_8_v(diff, temp);
+ hadamard_8_h(temp, sum);
- int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(t0, r0));
- int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
-
- return _satd_4x4_neon(v0, v1);
+ out[0] = vaddq_u16(sum[0], sum[1]);
+ out[1] = vaddq_u16(sum[2], sum[3]);
}
+#elif X265_DEPTH == 10
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
+{
+ int16x8_t temp[8];
+ uint16x8_t sum[4];
+ hadamard_8_v(diff, temp);
+ hadamard_8_h(temp, sum);
+ out[0] = vpaddlq_u16(sum[0]);
+ out[1] = vpaddlq_u16(sum[1]);
+ out[0] = vpadalq_u16(out[0], sum[2]);
+ out[1] = vpadalq_u16(out[1], sum[3]);
+}
-
-
-int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#elif X265_DEPTH == 12
+static inline void hadamard_8x8(int16x8_t diff[8], uint32x4_t out[2])
{
- uint16x8_t i0, i1, i2, i3, i4, i5, i6, i7;
-
- i0 = vld1q_u16(pix1 + 0 * stride_pix1);
- i1 = vld1q_u16(pix2 + 0 * stride_pix2);
- i2 = vld1q_u16(pix1 + 1 * stride_pix1);
- i3 = vld1q_u16(pix2 + 1 * stride_pix2);
- i4 = vld1q_u16(pix1 + 2 * stride_pix1);
- i5 = vld1q_u16(pix2 + 2 * stride_pix2);
- i6 = vld1q_u16(pix1 + 3 * stride_pix1);
- i7 = vld1q_u16(pix2 + 3 * stride_pix2);
+ int16x8_t temp[8];
+ uint32x4_t sum[4];
- int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(i0, i1));
- int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(i2, i3));
- int16x8_t v2 = vreinterpretq_s16_u16(vsubq_u16(i4, i5));
- int16x8_t v3 = vreinterpretq_s16_u16(vsubq_u16(i6, i7));
+ hadamard_8_v(diff, temp);
+ hadamard_8_h(temp, sum);
- return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+ out[0] = vaddq_u32(sum[0], sum[1]);
+ out[1] = vaddq_u32(sum[2], sum[3]);
}
+#endif // X265_DEPTH == 8
-int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
+#if HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
- int16x8_t v0, v1, v2, v3;
+ uint16x4_t s[4], r[4];
+ load_u16x4xn<4>(pix1, stride_pix1, s);
+ load_u16x4xn<4>(pix2, stride_pix2, r);
- for (int offset = 0; offset <= 12; offset += 4)
- {
- _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1,
- pix2 + offset * stride_pix2,stride_pix2,
- v0, v1, v2, v3);
- v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v0));
- v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v1));
- v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v2));
- v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v3));
- }
+ uint16x8_t s0 = vcombine_u16(s[0], s[2]);
+ uint16x8_t s1 = vcombine_u16(s[1], s[3]);
+ uint16x8_t r0 = vcombine_u16(r[0], r[2]);
+ uint16x8_t r1 = vcombine_u16(r[1], r[3]);
+
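+ // Alternate rows are subtracted in opposite order; the sign flip is
+ // harmless as only absolute values are accumulated.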
+ int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
- return vaddvq_u32(vaddq_u32(v30, v31));
+ return hadamard_4x4(diff0, diff1);
}
-#else //HIGH_BIT_DEPTH
+static inline int pixel_satd_4x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[4];
+
+ uint16x4_t s[8], r[8];
+ load_u16x4xn<8>(pix1, stride_pix1, s);
+ load_u16x4xn<8>(pix2, stride_pix2, r);
+
+ uint16x8_t s0 = vcombine_u16(s[0], s[4]);
+ uint16x8_t s1 = vcombine_u16(s[1], s[5]);
+ uint16x8_t s2 = vcombine_u16(s[2], s[6]);
+ uint16x8_t s3 = vcombine_u16(s[3], s[7]);
+ uint16x8_t r0 = vcombine_u16(r[0], r[4]);
+ uint16x8_t r1 = vcombine_u16(r[1], r[5]);
+ uint16x8_t r2 = vcombine_u16(r[2], r[6]);
+ uint16x8_t r3 = vcombine_u16(r[3], r[7]);
+
+ diff[0] = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ diff[1] = vreinterpretq_s16_u16(vsubq_u16(r1, s1));
+ diff[2] = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+ diff[3] = vreinterpretq_s16_u16(vsubq_u16(r3, s3));
+
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
+
+ return vaddlvq_u16(out);
+}
-static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint8x16_t r0, r1, r2, r3;
- uint8x16_t t0, t1, t2, t3;
- int16x8_t v16, v17, v20, v21;
- int16x8_t v18, v19, v22, v23;
+ int16x8_t diff[4];
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
- r0 = vld1q_u8(pix1 + 0 * stride_pix1);
- r1 = vld1q_u8(pix1 + 1 * stride_pix1);
- r2 = vld1q_u8(pix1 + 2 * stride_pix1);
- r3 = vld1q_u8(pix1 + 3 * stride_pix1);
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
- t0 = vld1q_u8(pix2 + 0 * stride_pix2);
- t1 = vld1q_u8(pix2 + 1 * stride_pix2);
- t2 = vld1q_u8(pix2 + 2 * stride_pix2);
- t3 = vld1q_u8(pix2 + 3 * stride_pix2);
+ return vaddlvq_u16(out);
+}
- v16 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r0), vget_low_u8(t0)));
- v20 = vreinterpretq_s16_u16(vsubl_high_u8(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r1), vget_low_u8(t1)));
- v21 = vreinterpretq_s16_u16(vsubl_high_u8(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r2), vget_low_u8(t2)));
- v22 = vreinterpretq_s16_u16(vsubl_high_u8(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r3), vget_low_u8(t3)));
- v23 = vreinterpretq_s16_u16(vsubl_high_u8(r3, t3));
+static inline int pixel_satd_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
+ uint16x8_t out[2];
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
- _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+ uint32x4_t res = vpaddlq_u16(out[0]);
+ res = vpadalq_u16(res, out[1]);
+ return vaddvq_u32(res);
}
-
-static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3,
- int16x8_t &v20, int16x8_t &v21, int16x8_t &v22, int16x8_t &v23)
+static inline int pixel_satd_8x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint8x8_t r0, r1, r2, r3;
- uint8x8_t t0, t1, t2, t3;
- int16x8_t v16, v17;
- int16x8_t v18, v19;
+ int16x8_t diff[16];
+ uint16x8_t out[4];
+
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4_dual(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
- r0 = vld1_u8(pix1 + 0 * stride_pix1);
- r1 = vld1_u8(pix1 + 1 * stride_pix1);
- r2 = vld1_u8(pix1 + 2 * stride_pix1);
- r3 = vld1_u8(pix1 + 3 * stride_pix1);
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
- t0 = vld1_u8(pix2 + 0 * stride_pix2);
- t1 = vld1_u8(pix2 + 1 * stride_pix2);
- t2 = vld1_u8(pix2 + 2 * stride_pix2);
- t3 = vld1_u8(pix2 + 3 * stride_pix2);
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
- v16 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
- v17 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
- v18 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
- v19 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+ uint32x4_t res = vpaddlq_u16(sum0);
+ res = vpadalq_u16(res, sum1);
- r0 = vld1_u8(pix1 + 4 * stride_pix1);
- r1 = vld1_u8(pix1 + 5 * stride_pix1);
- r2 = vld1_u8(pix1 + 6 * stride_pix1);
- r3 = vld1_u8(pix1 + 7 * stride_pix1);
+ return vaddvq_u32(res);
+}
- t0 = vld1_u8(pix2 + 4 * stride_pix2);
- t1 = vld1_u8(pix2 + 5 * stride_pix2);
- t2 = vld1_u8(pix2 + 6 * stride_pix2);
- t3 = vld1_u8(pix2 + 7 * stride_pix2);
+static inline int pixel_satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
- v20 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
- v21 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
- v22 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
- v23 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
+ uint16x8_t sum0, sum1;
+ hadamard_4x4_dual(diff, &sum0);
+ hadamard_4x4_dual(diff + 4, &sum1);
- SUMSUB_AB(v0, v1, v16, v17);
- SUMSUB_AB(v2, v3, v18, v19);
+ sum0 = vaddq_u16(sum0, sum1);
+ return vaddlvq_u16(sum0);
}
-int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint8x8_t t0 = load_u8x4x2(pix1, 2 * stride_pix1);
- uint8x8_t t1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
+ int16x8_t diff[16];
+ uint16x8_t out[4];
- uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
- uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4_dual(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
- return _satd_4x4_neon(vreinterpretq_s16_u16(vsubl_u8(t0, r0)),
- vreinterpretq_s16_u16(vsubl_u8(r1, t1)));
-}
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
+#if X265_DEPTH == 10
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
-int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
-{
- uint8x8_t i0, i1, i2, i3, i4, i5, i6, i7;
+ sum0 = vaddq_u16(sum0, sum1);
- i0 = vld1_u8(pix1 + 0 * stride_pix1);
- i1 = vld1_u8(pix2 + 0 * stride_pix2);
- i2 = vld1_u8(pix1 + 1 * stride_pix1);
- i3 = vld1_u8(pix2 + 1 * stride_pix2);
- i4 = vld1_u8(pix1 + 2 * stride_pix1);
- i5 = vld1_u8(pix2 + 2 * stride_pix2);
- i6 = vld1_u8(pix1 + 3 * stride_pix1);
- i7 = vld1_u8(pix2 + 3 * stride_pix2);
+ return vaddlvq_u16(sum0);
+#else // X265_DEPTH == 12
+ uint32x4_t sum0 = vpaddlq_u16(out[0]);
+ uint32x4_t sum1 = vpaddlq_u16(out[1]);
+ sum0 = vpadalq_u16(sum0, out[2]);
+ sum1 = vpadalq_u16(sum1, out[3]);
- int16x8_t v0 = vreinterpretq_s16_u16(vsubl_u8(i0, i1));
- int16x8_t v1 = vreinterpretq_s16_u16(vsubl_u8(i2, i3));
- int16x8_t v2 = vreinterpretq_s16_u16(vsubl_u8(i4, i5));
- int16x8_t v3 = vreinterpretq_s16_u16(vsubl_u8(i6, i7));
+ sum0 = vaddq_u32(sum0, sum1);
- return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
+ return vaddvq_u32(sum0);
+#endif // X265_DEPTH == 10
}
-int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- uint16x8_t v30, v31;
- int16x8_t v0, v1, v2, v3;
- uint16x8_t t0, t1;
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ int16x8_t diff[8];
+ uint16x8_t out[2];
- _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
- v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ for (int i = 0; i < 4; ++i)
+ {
+ load_diff_u16x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u16x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
- _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
- v30 = vaddq_u16(v30, t0);
- v31 = vaddq_u16(v31, t1);
+ hadamard_4x4_quad(diff, out);
- _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
- v30 = vaddq_u16(v30, t0);
- v31 = vaddq_u16(v31, t1);
+ sum[0] = vpadalq_u16(sum[0], out[0]);
+ sum[1] = vpadalq_u16(sum[1], out[1]);
- _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
- v30 = vaddq_u16(v30, t0);
- v31 = vaddq_u16(v31, t1);
+ pix1 += 4 * stride_pix1;
+ pix2 += 4 * stride_pix2;
+ }
- uint32x4_t sum0 = vpaddlq_u16(v30);
- uint32x4_t sum1 = vpaddlq_u16(v31);
- sum0 = vaddq_u32(sum0, sum1);
- return vaddvq_u32(sum0);
+ return vaddvq_u32(vaddq_u32(sum[0], sum[1]));
}
-#endif //HIGH_BIT_DEPTH
-#if HIGH_BIT_DEPTH
-typedef uint32x4_t sa8d_out_type;
-#else
-typedef uint16x8_t sa8d_out_type;
-#endif
-
-static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
- int16x8_t v3, int16x8_t v20,
- int16x8_t v21, int16x8_t v22,
- int16x8_t v23, sa8d_out_type &out0,
- sa8d_out_type &out1)
+static inline int pixel_sa8d_8x8_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v16, v17, v18, v19;
- int16x8_t v4, v5, v6, v7;
+ int16x8_t diff[8];
+ uint32x4_t res[2];
- SUMSUB_AB(v16, v18, v0, v2);
- SUMSUB_AB(v17, v19, v1, v3);
+ load_diff_u16x8x4_dual(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
- HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
+ uint32x4_t s = vaddq_u32(res[0], res[1]);
- SUMSUB_AB(v0, v16, v16, v20);
- SUMSUB_AB(v1, v17, v17, v21);
- SUMSUB_AB(v2, v18, v18, v22);
- SUMSUB_AB(v3, v19, v19, v23);
-
- transpose_8h_8h(v20, v21, v16, v17);
- transpose_8h_8h(v4, v5, v0, v1);
- transpose_8h_8h(v22, v23, v18, v19);
- transpose_8h_8h(v6, v7, v2, v3);
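+ // Return half of the accumulated cost, rounded to nearest.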
+ return (vaddvq_u32(s) + 1) >> 1;
+}
-#if (X265_DEPTH <= 10)
+static inline int pixel_sa8d_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1,
+ const uint16_t *pix2, intptr_t stride_pix2)
+{
+ uint32x4_t sum0, sum1;
- int16x8_t v24, v25;
+ int16x8_t diff[8];
+ uint32x4_t res[2];
- SUMSUB_AB(v2, v3, v20, v21);
- SUMSUB_AB(v24, v25, v4, v5);
- SUMSUB_AB(v0, v1, v22, v23);
- SUMSUB_AB(v4, v5, v6, v7);
+ load_diff_u16x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vaddq_u32(res[0], res[1]);
- transpose_4s_8h(v20, v22, v2, v0);
- transpose_4s_8h(v21, v23, v3, v1);
- transpose_4s_8h(v16, v18, v24, v4);
- transpose_4s_8h(v17, v19, v25, v5);
+ load_diff_u16x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum1 = vaddq_u32(res[0], res[1]);
- SUMSUB_AB(v0, v2, v20, v22);
- SUMSUB_AB(v1, v3, v21, v23);
- SUMSUB_AB(v4, v6, v16, v18);
- SUMSUB_AB(v5, v7, v17, v19);
+ load_diff_u16x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vaddq_u32(sum0, res[0]);
+ sum1 = vaddq_u32(sum1, res[1]);
- transpose_2d_8h(v16, v20, v0, v4);
- transpose_2d_8h(v17, v21, v1, v5);
- transpose_2d_8h(v18, v22, v2, v6);
- transpose_2d_8h(v19, v23, v3, v7);
+ load_diff_u16x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+ pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vaddq_u32(sum0, res[0]);
+ sum1 = vaddq_u32(sum1, res[1]);
- uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v16));
- uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v17));
- uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v18));
- uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v19));
- uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v20));
- uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v21));
- uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v22));
- uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v23));
+ sum0 = vaddq_u32(sum0, sum1);
- uint16x8_t max0 = vmaxq_u16(abs0, abs4);
- uint16x8_t max1 = vmaxq_u16(abs1, abs5);
- uint16x8_t max2 = vmaxq_u16(abs2, abs6);
- uint16x8_t max3 = vmaxq_u16(abs3, abs7);
+ return (vaddvq_u32(sum0) + 1) >> 1;
+}
-#if HIGH_BIT_DEPTH
- out0 = vpaddlq_u16(max0);
- out1 = vpaddlq_u16(max1);
- out0 = vpadalq_u16(out0, max2);
- out1 = vpadalq_u16(out1, max3);
-
-#else //HIGH_BIT_DEPTH
-
- out0 = vaddq_u16(max0, max1);
- out1 = vaddq_u16(max2, max3);
-
-#endif //HIGH_BIT_DEPTH
-
-#else // HIGH_BIT_DEPTH 12 bit only, switching math to int32, each int16x8 is up-convreted to 2 int32x4 (low and high)
-
- int32x4_t v2l, v2h, v3l, v3h, v24l, v24h, v25l, v25h, v0l, v0h, v1l, v1h;
- int32x4_t v22l, v22h, v23l, v23h;
- int32x4_t v4l, v4h, v5l, v5h;
- int32x4_t v6l, v6h, v7l, v7h;
- int32x4_t v16l, v16h, v17l, v17h;
- int32x4_t v18l, v18h, v19l, v19h;
- int32x4_t v20l, v20h, v21l, v21h;
-
- ISUMSUB_AB_FROM_INT16(v2l, v2h, v3l, v3h, v20, v21);
- ISUMSUB_AB_FROM_INT16(v24l, v24h, v25l, v25h, v4, v5);
-
- v22l = vmovl_s16(vget_low_s16(v22));
- v22h = vmovl_high_s16(v22);
- v23l = vmovl_s16(vget_low_s16(v23));
- v23h = vmovl_high_s16(v23);
-
- ISUMSUB_AB(v0l, v1l, v22l, v23l);
- ISUMSUB_AB(v0h, v1h, v22h, v23h);
-
- v6l = vmovl_s16(vget_low_s16(v6));
- v6h = vmovl_high_s16(v6);
- v7l = vmovl_s16(vget_low_s16(v7));
- v7h = vmovl_high_s16(v7);
-
- ISUMSUB_AB(v4l, v5l, v6l, v7l);
- ISUMSUB_AB(v4h, v5h, v6h, v7h);
-
- transpose_2d_4s(v20l, v22l, v2l, v0l);
- transpose_2d_4s(v21l, v23l, v3l, v1l);
- transpose_2d_4s(v16l, v18l, v24l, v4l);
- transpose_2d_4s(v17l, v19l, v25l, v5l);
-
- transpose_2d_4s(v20h, v22h, v2h, v0h);
- transpose_2d_4s(v21h, v23h, v3h, v1h);
- transpose_2d_4s(v16h, v18h, v24h, v4h);
- transpose_2d_4s(v17h, v19h, v25h, v5h);
-
- ISUMSUB_AB(v0l, v2l, v20l, v22l);
- ISUMSUB_AB(v1l, v3l, v21l, v23l);
- ISUMSUB_AB(v4l, v6l, v16l, v18l);
- ISUMSUB_AB(v5l, v7l, v17l, v19l);
-
- ISUMSUB_AB(v0h, v2h, v20h, v22h);
- ISUMSUB_AB(v1h, v3h, v21h, v23h);
- ISUMSUB_AB(v4h, v6h, v16h, v18h);
- ISUMSUB_AB(v5h, v7h, v17h, v19h);
-
- v16l = v0l;
- v16h = v4l;
- v20l = v0h;
- v20h = v4h;
-
- v17l = v1l;
- v17h = v5l;
- v21l = v1h;
- v21h = v5h;
-
- v18l = v2l;
- v18h = v6l;
- v22l = v2h;
- v22h = v6h;
-
- v19l = v3l;
- v19h = v7l;
- v23l = v3h;
- v23h = v7h;
-
- uint32x4_t abs0_lo = vreinterpretq_u32_s32(vabsq_s32(v16l));
- uint32x4_t abs1_lo = vreinterpretq_u32_s32(vabsq_s32(v17l));
- uint32x4_t abs2_lo = vreinterpretq_u32_s32(vabsq_s32(v18l));
- uint32x4_t abs3_lo = vreinterpretq_u32_s32(vabsq_s32(v19l));
- uint32x4_t abs4_lo = vreinterpretq_u32_s32(vabsq_s32(v20l));
- uint32x4_t abs5_lo = vreinterpretq_u32_s32(vabsq_s32(v21l));
- uint32x4_t abs6_lo = vreinterpretq_u32_s32(vabsq_s32(v22l));
- uint32x4_t abs7_lo = vreinterpretq_u32_s32(vabsq_s32(v23l));
-
- uint32x4_t abs0_hi = vreinterpretq_u32_s32(vabsq_s32(v16h));
- uint32x4_t abs1_hi = vreinterpretq_u32_s32(vabsq_s32(v17h));
- uint32x4_t abs2_hi = vreinterpretq_u32_s32(vabsq_s32(v18h));
- uint32x4_t abs3_hi = vreinterpretq_u32_s32(vabsq_s32(v19h));
- uint32x4_t abs4_hi = vreinterpretq_u32_s32(vabsq_s32(v20h));
- uint32x4_t abs5_hi = vreinterpretq_u32_s32(vabsq_s32(v21h));
- uint32x4_t abs6_hi = vreinterpretq_u32_s32(vabsq_s32(v22h));
- uint32x4_t abs7_hi = vreinterpretq_u32_s32(vabsq_s32(v23h));
-
- uint32x4_t max0_lo = vmaxq_u32(abs0_lo, abs4_lo);
- uint32x4_t max1_lo = vmaxq_u32(abs1_lo, abs5_lo);
- uint32x4_t max2_lo = vmaxq_u32(abs2_lo, abs6_lo);
- uint32x4_t max3_lo = vmaxq_u32(abs3_lo, abs7_lo);
-
- uint32x4_t max0_hi = vmaxq_u32(abs0_hi, abs4_hi);
- uint32x4_t max1_hi = vmaxq_u32(abs1_hi, abs5_hi);
- uint32x4_t max2_hi = vmaxq_u32(abs2_hi, abs6_hi);
- uint32x4_t max3_hi = vmaxq_u32(abs3_hi, abs7_hi);
-
- uint32x4_t sum0 = vaddq_u32(max0_lo, max0_hi);
- uint32x4_t sum1 = vaddq_u32(max1_lo, max1_hi);
- uint32x4_t sum2 = vaddq_u32(max2_lo, max2_hi);
- uint32x4_t sum3 = vaddq_u32(max3_lo, max3_hi);
-
- out0 = vaddq_u32(sum0, sum1);
- out1 = vaddq_u32(sum2, sum3);
+#else // !HIGH_BIT_DEPTH
+static inline int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ uint8x8_t s0 = load_u8x4x2(pix1, 2 * stride_pix1);
+ uint8x8_t s1 = load_u8x4x2(pix1 + stride_pix1, 2 * stride_pix1);
+ uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
+ uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
-#endif
+ int16x8_t diff0 = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+ int16x8_t diff1 = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
+ return hadamard_4x4(diff0, diff1);
}
-
-
-static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2,
- int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
+static inline int pixel_satd_4x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
+ int16x8_t diff[4];
+
+ uint8x8_t s0 = load_u8x4x2(pix1 + 0 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t s1 = load_u8x4x2(pix1 + 1 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t s2 = load_u8x4x2(pix1 + 2 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t s3 = load_u8x4x2(pix1 + 3 * stride_pix1, 4 * stride_pix1);
+ uint8x8_t r0 = load_u8x4x2(pix2 + 0 * stride_pix2, 4 * stride_pix2);
+ uint8x8_t r1 = load_u8x4x2(pix2 + 1 * stride_pix2, 4 * stride_pix2);
+ uint8x8_t r2 = load_u8x4x2(pix2 + 2 * stride_pix2, 4 * stride_pix2);
+ uint8x8_t r3 = load_u8x4x2(pix2 + 3 * stride_pix2, 4 * stride_pix2);
+
+ diff[0] = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+ diff[1] = vreinterpretq_s16_u16(vsubl_u8(r1, s1));
+ diff[2] = vreinterpretq_s16_u16(vsubl_u8(s2, r2));
+ diff[3] = vreinterpretq_s16_u16(vsubl_u8(r3, s3));
+
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
+
+ return vaddlvq_u16(out);
+}
- int16x8_t v20, v21, v22, v23;
- _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _satd_8x4v_8x8h_neon(v0, v1, v2, v3, v20, v21, v22, v23);
+static inline int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[4];
-}
+ load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ uint16x8_t out;
+ hadamard_4x4_dual(diff, &out);
+ return vaddlvq_u16(out);
+}
-int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v0, v1, v2, v3;
+ int16x8_t diff[8];
+ uint16x8_t out[2];
- _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
- uint16x8_t v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
- uint16x8_t v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
-#if !(HIGH_BIT_DEPTH)
- uint16x8_t sum = vaddq_u16(v30, v31);
- return vaddvq_u32(vpaddlq_u16(sum));
-#else
- uint32x4_t sum = vpaddlq_u16(v30);
- sum = vpadalq_u16(sum, v31);
- return vaddvq_u32(sum);
-#endif
-}
+ out[0] = vaddq_u16(out[0], out[1]);
+ return vaddlvq_u16(out[0]);
+}
-int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_8x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v0, v1, v2, v3;
- int16x8_t v20, v21, v22, v23;
- sa8d_out_type res0, res1;
+ int16x8_t diff[16];
+ uint16x8_t out[4];
- _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
-
-#if HIGH_BIT_DEPTH
- uint32x4_t s = vaddq_u32(res0, res1);
- return (vaddvq_u32(s) + 1) >> 1;
-#else
- return (vaddlvq_u16(vaddq_u16(res0, res1)) + 1) >> 1;
-#endif
-}
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff + 8);
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
+ sum0 = vaddq_u16(sum0, sum1);
+ return vaddlvq_u16(sum0);
+}
-int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
+static inline int pixel_satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v0, v1, v2, v3;
- int16x8_t v20, v21, v22, v23;
- sa8d_out_type res0, res1;
- uint32x4_t v30, v31;
+ int16x8_t diff[8];
- _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ load_diff_u8x8x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x4(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 4);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpaddlq_u16(res0);
- v31 = vpaddlq_u16(res1);
-#else
- v30 = vaddq_u32(res0, res1);
-#endif
+ uint16x8_t out[2];
+ hadamard_4x4_dual(diff, &out[0]);
+ hadamard_4x4_dual(diff + 4, &out[1]);
- _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ out[0] = vaddq_u16(out[0], out[1]);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpadalq_u16(v30, res0);
- v31 = vpadalq_u16(v31, res1);
-#else
- v31 = vaddq_u32(res0, res1);
-#endif
+ return vaddlvq_u16(out[0]);
+}
+static inline int pixel_satd_16x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[16];
+ uint16x8_t out[4];
- _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
- v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff + 8);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpadalq_u16(v30, res0);
- v31 = vpadalq_u16(v31, res1);
-#else
- v30 = vaddq_u32(v30, res0);
- v31 = vaddq_u32(v31, res1);
-#endif
+ hadamard_4x4_quad(diff, out);
+ hadamard_4x4_quad(diff + 8, out + 2);
- _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
- v22, v23);
- _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
+ uint16x8_t sum0 = vaddq_u16(out[0], out[1]);
+ uint16x8_t sum1 = vaddq_u16(out[2], out[3]);
-#if !(HIGH_BIT_DEPTH)
- v30 = vpadalq_u16(v30, res0);
- v31 = vpadalq_u16(v31, res1);
-#else
- v30 = vaddq_u32(v30, res0);
- v31 = vaddq_u32(v31, res1);
-#endif
+ sum0 = vaddq_u16(sum0, sum1);
+
+ return vaddlvq_u16(sum0);
+}
+
+static inline int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ uint16x8_t sum[2], out[2];
+ int16x8_t diff[8];
+
+ load_diff_u8x16x4(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = out[0];
+ sum[1] = out[1];
+
+ load_diff_u8x16x4(pix1 + 4 * stride_pix1, stride_pix1,
+ pix2 + 4 * stride_pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = vaddq_u16(sum[0], out[0]);
+ sum[1] = vaddq_u16(sum[1], out[1]);
+
+ load_diff_u8x16x4(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = vaddq_u16(sum[0], out[0]);
+ sum[1] = vaddq_u16(sum[1], out[1]);
+
+ load_diff_u8x16x4(pix1 + 12 * stride_pix1, stride_pix1,
+ pix2 + 12 * stride_pix2, stride_pix2, diff);
+ hadamard_4x4_quad(diff, out);
+ sum[0] = vaddq_u16(sum[0], out[0]);
+ sum[1] = vaddq_u16(sum[1], out[1]);
+
+ uint32x4_t sum0 = vpaddlq_u16(sum[0]);
+ uint32x4_t sum1 = vpaddlq_u16(sum[1]);
- v30 = vaddq_u32(v30, v31);
+ sum0 = vaddq_u32(sum0, sum1);
- return (vaddvq_u32(v30) + 1) >> 1;
+ return vaddvq_u32(sum0);
}
+static inline int pixel_sa8d_8x8_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
+ uint16x8_t res[2];
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ return (vaddlvq_u16(vaddq_u16(res[0], res[1])) + 1) >> 1;
+}
+static inline int pixel_sa8d_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1,
+ const uint8_t *pix2, intptr_t stride_pix2)
+{
+ int16x8_t diff[8];
+ uint16x8_t res[2];
+ uint32x4_t sum0, sum1;
+
+ load_diff_u8x8x8(pix1, stride_pix1, pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpaddlq_u16(res[0]);
+ sum1 = vpaddlq_u16(res[1]);
+
+ load_diff_u8x8x8(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpadalq_u16(sum0, res[0]);
+ sum1 = vpadalq_u16(sum1, res[1]);
+
+ load_diff_u8x8x8(pix1 + 8 * stride_pix1, stride_pix1,
+ pix2 + 8 * stride_pix2, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpadalq_u16(sum0, res[0]);
+ sum1 = vpadalq_u16(sum1, res[1]);
+
+ load_diff_u8x8x8(pix1 + 8 * stride_pix1 + 8, stride_pix1,
+ pix2 + 8 * stride_pix2 + 8, stride_pix2, diff);
+ hadamard_8x8(diff, res);
+ sum0 = vpadalq_u16(sum0, res[0]);
+ sum1 = vpadalq_u16(sum1, res[1]);
+ sum0 = vaddq_u32(sum0, sum1);
+ return (vaddvq_u32(sum0) + 1) >> 1;
+}
+#endif // HIGH_BIT_DEPTH
template<int size>
void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
@@ -1425,7 +1431,7 @@ int psyCost_pp_neon(const pixel *source, intptr_t sstride, const pixel *recon, i
template<int w, int h>
// Calculate sa8d in blocks of 8x8
-int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d8_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
{
int cost = 0;
@@ -1440,7 +1446,7 @@ int sa8d8(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2
template<int w, int h>
// Calculate sa8d in blocks of 16x16
-int sa8d16(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
+int sa8d16_neon(const pixel *pix1, intptr_t i_pix1, const pixel *pix2, intptr_t i_pix2)
{
int cost = 0;
@@ -1474,42 +1480,63 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
template<int w, int h>
-// calculate satd in blocks of 4x4
int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
{
int satd = 0;
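+ // 4x4 blocks use the single-block kernel; every other size handled here
+ // tiles the 4x8 kernel.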
- for (int row = 0; row < h; row += 4)
- for (int col = 0; col < w; col += 4)
- satd += pixel_satd_4x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
- pix2 + row * stride_pix2 + col, stride_pix2);
+ if (w == 4 && h == 4)
+ {
+ satd = pixel_satd_4x4_neon(pix1, stride_pix1, pix2, stride_pix2);
+ }
+ else
+ {
+ for (int row = 0; row < h; row += 8)
+ for (int col = 0; col < w; col += 4)
+ satd += pixel_satd_4x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
return satd;
}
template<int w, int h>
-// calculate satd in blocks of 8x4
int satd8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
{
int satd = 0;
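+ // Dispatch to the largest SATD kernel whose dimensions evenly tile the block.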
- if (((w | h) & 15) == 0)
+ if (w % 16 == 0 && h % 16 == 0)
{
for (int row = 0; row < h; row += 16)
for (int col = 0; col < w; col += 16)
satd += pixel_satd_16x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
-
}
- else if (((w | h) & 7) == 0)
+ else if (w % 8 == 0 && h % 16 == 0)
+ {
+ for (int row = 0; row < h; row += 16)
+ for (int col = 0; col < w; col += 8)
+ satd += pixel_satd_8x16_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
+ else if (w % 16 == 0 && h % 8 == 0)
+ {
+ for (int row = 0; row < h; row += 8)
+ for (int col = 0; col < w; col += 16)
+ satd += pixel_satd_16x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
+ else if (w % 16 == 0 && h % 4 == 0)
+ {
+ for (int row = 0; row < h; row += 4)
+ for (int col = 0; col < w; col += 16)
+ satd += pixel_satd_16x4_neon(pix1 + row * stride_pix1 + col, stride_pix1,
+ pix2 + row * stride_pix2 + col, stride_pix2);
+ }
+ else if (w % 8 == 0 && h % 8 == 0)
{
for (int row = 0; row < h; row += 8)
for (int col = 0; col < w; col += 8)
satd += pixel_satd_8x8_neon(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
-
}
- else
+ else // w multiple of 8, h multiple of 4
{
for (int row = 0; row < h; row += 4)
for (int col = 0; col < w; col += 8)
@@ -1634,38 +1661,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
LUMA_PU(64, 16);
LUMA_PU(16, 64);
- p.pu[LUMA_4x4].satd = pixel_satd_4x4_neon;
- p.pu[LUMA_8x4].satd = pixel_satd_8x4_neon;
-
+ p.pu[LUMA_4x4].satd = satd4_neon<4, 4>;
+ p.pu[LUMA_4x8].satd = satd4_neon<4, 8>;
+ p.pu[LUMA_4x16].satd = satd4_neon<4, 16>;
+ p.pu[LUMA_8x4].satd = satd8_neon<8, 4>;
p.pu[LUMA_8x8].satd = satd8_neon<8, 8>;
- p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
- p.pu[LUMA_16x8].satd = satd8_neon<16, 8>;
p.pu[LUMA_8x16].satd = satd8_neon<8, 16>;
- p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+ p.pu[LUMA_8x32].satd = satd8_neon<8, 32>;
+ p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
p.pu[LUMA_16x4].satd = satd8_neon<16, 4>;
- p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
- p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+ p.pu[LUMA_16x8].satd = satd8_neon<16, 8>;
+ p.pu[LUMA_16x12].satd = satd8_neon<16, 12>;
+ p.pu[LUMA_16x16].satd = satd8_neon<16, 16>;
p.pu[LUMA_16x32].satd = satd8_neon<16, 32>;
- p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+ p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
p.pu[LUMA_24x32].satd = satd8_neon<24, 32>;
p.pu[LUMA_32x8].satd = satd8_neon<32, 8>;
- p.pu[LUMA_8x32].satd = satd8_neon<8, 32>;
- p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
- p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+ p.pu[LUMA_32x16].satd = satd8_neon<32, 16>;
+ p.pu[LUMA_32x24].satd = satd8_neon<32, 24>;
+ p.pu[LUMA_32x32].satd = satd8_neon<32, 32>;
p.pu[LUMA_32x64].satd = satd8_neon<32, 64>;
- p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
p.pu[LUMA_48x64].satd = satd8_neon<48, 64>;
p.pu[LUMA_64x16].satd = satd8_neon<64, 16>;
- p.pu[LUMA_16x64].satd = satd8_neon<16, 64>;
-
-#if HIGH_BIT_DEPTH
- p.pu[LUMA_4x8].satd = satd4_neon<4, 8>;
- p.pu[LUMA_4x16].satd = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
- p.pu[LUMA_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+ p.pu[LUMA_64x32].satd = satd8_neon<64, 32>;
+ p.pu[LUMA_64x48].satd = satd8_neon<64, 48>;
+ p.pu[LUMA_64x64].satd = satd8_neon<64, 64>;
LUMA_CU(4, 4);
@@ -1673,7 +1693,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
LUMA_CU(16, 16);
LUMA_CU(32, 32);
LUMA_CU(64, 64);
-
+
#if !(HIGH_BIT_DEPTH)
p.cu[BLOCK_8x8].var = pixel_var_neon<8>;
p.cu[BLOCK_16x16].var = pixel_var_neon<16>;
@@ -1697,17 +1717,17 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_8x8].calcresidual[ALIGNED] = getResidual_neon<8>;
p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = getResidual_neon<16>;
p.cu[BLOCK_16x16].calcresidual[ALIGNED] = getResidual_neon<16>;
-
+
#if defined(__APPLE__)
p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = getResidual_neon<32>;
p.cu[BLOCK_32x32].calcresidual[ALIGNED] = getResidual_neon<32>;
#endif // defined(__APPLE__)
- p.cu[BLOCK_4x4].sa8d = pixel_satd_4x4_neon;
- p.cu[BLOCK_8x8].sa8d = pixel_sa8d_8x8_neon;
- p.cu[BLOCK_16x16].sa8d = pixel_sa8d_16x16_neon;
- p.cu[BLOCK_32x32].sa8d = sa8d16<32, 32>;
- p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
+ p.cu[BLOCK_4x4].sa8d = satd4_neon<4, 4>;
+ p.cu[BLOCK_8x8].sa8d = sa8d8_neon<8, 8>;
+ p.cu[BLOCK_16x16].sa8d = sa8d16_neon<16, 16>;
+ p.cu[BLOCK_32x32].sa8d = sa8d16_neon<32, 32>;
+ p.cu[BLOCK_64x64].sa8d = sa8d16_neon<64, 64>;
#define CHROMA_PU_420(W, H) \
@@ -1743,38 +1763,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x2].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = pixel_satd_4x4_neon;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8_neon<8, 8>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = pixel_satd_8x4_neon;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8_neon<16, 8>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8_neon<8, 16>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].satd = NULL;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd4_neon<16, 12>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd4_neon<16, 4>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8_neon<32, 8>;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8_neon<8, 32>;
-
-#if HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = satd4_neon<4, 4>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = satd4_neon<4, 8>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = satd4_neon<4, 16>;
-#endif // HIGH_BIT_DEPTH
-
-#if !defined(__APPLE__) || HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = satd8_neon<8, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].satd = NULL;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = satd8_neon<8, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = satd8_neon<8, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = satd8_neon<8, 32>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd4_neon<12, 16>;
-#endif // !defined(__APPLE__)
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = satd8_neon<16, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = satd8_neon<16, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd8_neon<16, 12>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd8_neon<16, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd8_neon<16, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd8_neon<24, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = satd8_neon<32, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd8_neon<32, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
#define CHROMA_CU_420(W, H) \
@@ -1783,7 +1795,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-
+
#define CHROMA_CU_S_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
@@ -1799,9 +1811,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd;
- p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8<8, 8>;
- p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16<16, 16>;
- p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d8_neon<8, 8>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d16_neon<16, 16>;
+ p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16_neon<32, 32>;
#define CHROMA_PU_422(W, H) \
@@ -1837,34 +1849,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8_neon<8, 16>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = pixel_satd_4x4_neon;
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = satd4_neon<4, 4>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4_neon<4, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4_neon<4, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4_neon<4, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd8_neon<8, 4>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = satd8_neon<8, 8>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd8_neon<8, 12>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = satd8_neon<8, 16>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = satd8_neon<8, 32>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].satd = NULL;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = satd4_neon<8, 4>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].satd = NULL;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8_neon<16, 8>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = satd4_neon<8, 12>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = satd8_neon<8, 64>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = satd8_neon<16, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd8_neon<16, 16>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd8_neon<16, 24>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd8_neon<16, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd8_neon<16, 64>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd8_neon<24, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd8_neon<32, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd8_neon<32, 32>;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd8_neon<32, 48>;
-
-#if HIGH_BIT_DEPTH
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = satd4_neon<4, 8>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = satd4_neon<4, 16>;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = satd4_neon<4, 32>;
-#endif // HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd8_neon<32, 64>;
#define CHROMA_CU_422(W, H) \
@@ -1887,10 +1896,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
CHROMA_CU_422(16, 32)
CHROMA_CU_422(32, 64)
- p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
- p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8<8, 16>;
- p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
- p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_8x8].sa8d = p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd;
+ p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d8_neon<8, 16>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16_neon<16, 32>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16_neon<32, 64>;
+
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = sa8d8_neon<8, 16>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = sa8d16_neon<16, 32>;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = sa8d16_neon<32, 64>;
p.weight_pp = weight_pp_neon;
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 106ba903a..856c12862 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -56,261 +56,3 @@ function PFX(pixel_sub_ps_8x16_sve)
ret
endfunc
-//******* satd *******
-.macro satd_4x4_sve
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z2.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z1.h}, p0/z, [x0]
- ld1b {z3.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z4.h}, p0/z, [x0]
- ld1b {z6.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z5.h}, p0/z, [x0]
- ld1b {z7.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
-
- sub z0.h, z0.h, z2.h
- sub z1.h, z1.h, z3.h
- sub z2.h, z4.h, z6.h
- sub z3.h, z5.h, z7.h
-
- add z4.h, z0.h, z2.h
- add z5.h, z1.h, z3.h
- sub z6.h, z0.h, z2.h
- sub z7.h, z1.h, z3.h
-
- add z0.h, z4.h, z5.h
- sub z1.h, z4.h, z5.h
-
- add z2.h, z6.h, z7.h
- sub z3.h, z6.h, z7.h
-
- trn1 z4.h, z0.h, z2.h
- trn2 z5.h, z0.h, z2.h
-
- trn1 z6.h, z1.h, z3.h
- trn2 z7.h, z1.h, z3.h
-
- add z0.h, z4.h, z5.h
- sub z1.h, z4.h, z5.h
-
- add z2.h, z6.h, z7.h
- sub z3.h, z6.h, z7.h
-
- trn1 z4.s, z0.s, z1.s
- trn2 z5.s, z0.s, z1.s
-
- trn1 z6.s, z2.s, z3.s
- trn2 z7.s, z2.s, z3.s
-
- abs z4.h, p0/m, z4.h
- abs z5.h, p0/m, z5.h
- abs z6.h, p0/m, z6.h
- abs z7.h, p0/m, z7.h
-
- smax z4.h, p0/m, z4.h, z5.h
- smax z6.h, p0/m, z6.h, z7.h
-
- add z0.h, z4.h, z6.h
-
- uaddlp v0.2s, v0.4h
- uaddlp v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_sve)
- ptrue p0.h, vl4
- satd_4x4_sve
- fmov x0, d0
- ret
-endfunc
-
-function PFX(pixel_satd_8x4_sve)
- ptrue p0.h, vl4
- mov x4, x0
- mov x5, x2
- satd_4x4_sve
- add x0, x4, #4
- add x2, x5, #4
- umov x6, v0.d[0]
- satd_4x4_sve
- umov x0, v0.d[0]
- add x0, x0, x6
- ret
-endfunc
-
-function PFX(pixel_satd_8x12_sve)
- ptrue p0.h, vl4
- mov x4, x0
- mov x5, x2
- mov x7, #0
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
-.rept 2
- sub x0, x0, #4
- sub x2, x2, #4
- mov x4, x0
- mov x5, x2
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_sve
- umov x6, v0.d[0]
- add x7, x7, x6
-.endr
- mov x0, x7
- ret
-endfunc
-
-.macro LOAD_DIFF_16x4_sve v0 v1 v2 v3 v4 v5 v6 v7
- mov x11, #8 // in order to consider CPUs whose vector size is greater than 128 bits
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z1.h}, p0/z, [x0, x11]
- ld1b {z2.h}, p0/z, [x2]
- ld1b {z3.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z4.h}, p0/z, [x0]
- ld1b {z5.h}, p0/z, [x0, x11]
- ld1b {z6.h}, p0/z, [x2]
- ld1b {z7.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- sub \v0\().h, z0.h, z2.h
- sub \v4\().h, z1.h, z3.h
- sub \v1\().h, z4.h, z6.h
- sub \v5\().h, z5.h, z7.h
-
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z1.h}, p0/z, [x0, x11]
- ld1b {z2.h}, p0/z, [x2]
- ld1b {z3.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- ld1b {z4.h}, p0/z, [x0]
- ld1b {z5.h}, p0/z, [x0, x11]
- ld1b {z6.h}, p0/z, [x2]
- ld1b {z7.h}, p0/z, [x2, x11]
- add x0, x0, x1
- add x2, x2, x3
- sub \v2\().h, z0.h, z2.h
- sub \v6\().h, z1.h, z3.h
- sub \v3\().h, z4.h, z6.h
- sub \v7\().h, z5.h, z7.h
-.endm
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_sve), export=0
- HADAMARD4_V z16.h, z18.h, z17.h, z19.h, z0.h, z2.h, z1.h, z3.h
- HADAMARD4_V z20.h, z21.h, z22.h, z23.h, z0.h, z1.h, z2.h, z3.h
- trn4 z0.h, z1.h, z2.h, z3.h, z16.h, z17.h, z18.h, z19.h
- trn4 z4.h, z5.h, z6.h, z7.h, z20.h, z21.h, z22.h, z23.h
- SUMSUB_ABCD z16.h, z17.h, z18.h, z19.h, z0.h, z1.h, z2.h, z3.h
- SUMSUB_ABCD z20.h, z21.h, z22.h, z23.h, z4.h, z5.h, z6.h, z7.h
- trn4 z0.s, z2.s, z1.s, z3.s, z16.s, z18.s, z17.s, z19.s
- trn4 z4.s, z6.s, z5.s, z7.s, z20.s, z22.s, z21.s, z23.s
- ABS8_SVE z0.h, z1.h, z2.h, z3.h, z4.h, z5.h, z6.h, z7.h, p0
- smax z0.h, p0/m, z0.h, z2.h
- smax z1.h, p0/m, z1.h, z3.h
- smax z4.h, p0/m, z4.h, z6.h
- smax z5.h, p0/m, z5.h, z7.h
- ret
-endfunc
-
-function PFX(satd_16x4_sve), export=0
- LOAD_DIFF_16x4_sve z16, z17, z18, z19, z20, z21, z22, z23
- b PFX(satd_8x4v_8x8h_sve)
-endfunc
-
-.macro pixel_satd_32x8_sve
- mov x4, x0
- mov x5, x2
-.rept 2
- bl PFX(satd_16x4_sve)
- add z30.h, z30.h, z0.h
- add z31.h, z31.h, z1.h
- add z30.h, z30.h, z4.h
- add z31.h, z31.h, z5.h
-.endr
- add x0, x4, #16
- add x2, x5, #16
-.rept 2
- bl PFX(satd_16x4_sve)
- add z30.h, z30.h, z0.h
- add z31.h, z31.h, z1.h
- add z30.h, z30.h, z4.h
- add z31.h, z31.h, z5.h
-.endr
-.endm
-
-.macro satd_32x16_sve
- movi v30.2d, #0
- movi v31.2d, #0
- pixel_satd_32x8_sve
- sub x0, x0, #16
- sub x2, x2, #16
- pixel_satd_32x8_sve
- add z0.h, z30.h, z31.h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
-.endm
-
-function PFX(pixel_satd_32x16_sve)
- ptrue p0.h, vl8
- mov x10, x30
- satd_32x16_sve
- mov x0, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x32_sve)
- ptrue p0.h, vl8
- mov x10, x30
- mov x7, #0
- satd_32x16_sve
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
- satd_32x16_sve
- add x0, x7, x6
- ret x10
-endfunc
-
-.macro satd_64x16_sve
- mov x8, x0
- mov x9, x2
- satd_32x16_sve
- add x7, x7, x6
- add x0, x8, #32
- add x2, x9, #32
- satd_32x16_sve
- add x7, x7, x6
-.endm
-
-function PFX(pixel_satd_64x48_sve)
- ptrue p0.h, vl8
- mov x10, x30
- mov x7, #0
-.rept 2
- satd_64x16_sve
- sub x0, x0, #48
- sub x2, x2, #48
-.endr
- satd_64x16_sve
- mov x0, x7
- ret x10
-endfunc
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 26fdbac6c..e189fdcd7 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -565,963 +565,6 @@ function PFX(scale2D_64to32_neon)
ret
endfunc
-//******* satd *******
-.macro satd_4x4_neon
- ldr s0, [x0]
- ldr s1, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[1], [x2], x3
-
- ldr s2, [x0]
- ldr s3, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1 {v2.s}[1], [x0], x1
- ld1 {v3.s}[1], [x2], x3
-
- usubl v4.8h, v0.8b, v1.8b
- usubl v5.8h, v2.8b, v3.8b
-
- add v6.8h, v4.8h, v5.8h
- sub v7.8h, v4.8h, v5.8h
-
- mov v4.d[0], v6.d[1]
- add v0.4h, v6.4h, v4.4h
- sub v2.4h, v6.4h, v4.4h
-
- mov v5.d[0], v7.d[1]
- add v1.4h, v7.4h, v5.4h
- sub v3.4h, v7.4h, v5.4h
-
- trn1 v4.4h, v0.4h, v1.4h
- trn2 v5.4h, v0.4h, v1.4h
-
- trn1 v6.4h, v2.4h, v3.4h
- trn2 v7.4h, v2.4h, v3.4h
-
- add v0.4h, v4.4h, v5.4h
- sub v1.4h, v4.4h, v5.4h
-
- add v2.4h, v6.4h, v7.4h
- sub v3.4h, v6.4h, v7.4h
-
- trn1 v4.2s, v0.2s, v1.2s
- trn2 v5.2s, v0.2s, v1.2s
-
- trn1 v6.2s, v2.2s, v3.2s
- trn2 v7.2s, v2.2s, v3.2s
-
- abs v4.4h, v4.4h
- abs v5.4h, v5.4h
- abs v6.4h, v6.4h
- abs v7.4h, v7.4h
-
- smax v1.4h, v4.4h, v5.4h
- smax v2.4h, v6.4h, v7.4h
-
- add v0.4h, v1.4h, v2.4h
- uaddlp v0.2s, v0.4h
- uaddlp v0.1d, v0.2s
-.endm
-
-// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x4_neon)
- satd_4x4_neon
- fmov x0, d0
- ret
-endfunc
-
-.macro x265_satd_4x8_8x4_end_neon
- add v0.8h, v4.8h, v6.8h
- add v1.8h, v5.8h, v7.8h
- sub v2.8h, v4.8h, v6.8h
- sub v3.8h, v5.8h, v7.8h
-
- trn1 v16.8h, v0.8h, v1.8h
- trn2 v17.8h, v0.8h, v1.8h
- add v4.8h, v16.8h, v17.8h
- trn1 v18.8h, v2.8h, v3.8h
- trn2 v19.8h, v2.8h, v3.8h
- sub v5.8h, v16.8h, v17.8h
- add v6.8h, v18.8h, v19.8h
- sub v7.8h, v18.8h, v19.8h
- trn1 v0.4s, v4.4s, v6.4s
- trn2 v2.4s, v4.4s, v6.4s
- abs v0.8h, v0.8h
- trn1 v1.4s, v5.4s, v7.4s
- trn2 v3.4s, v5.4s, v7.4s
- abs v2.8h, v2.8h
- abs v1.8h, v1.8h
- abs v3.8h, v3.8h
- umax v0.8h, v0.8h, v2.8h
- umax v1.8h, v1.8h, v3.8h
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
-.endm
-
-.macro pixel_satd_4x8_neon
- ld1r {v1.2s}, [x2], x3
- ld1r {v0.2s}, [x0], x1
- ld1r {v3.2s}, [x2], x3
- ld1r {v2.2s}, [x0], x1
- ld1r {v5.2s}, [x2], x3
- ld1r {v4.2s}, [x0], x1
- ld1r {v7.2s}, [x2], x3
- ld1r {v6.2s}, [x0], x1
-
- ld1 {v1.s}[1], [x2], x3
- ld1 {v0.s}[1], [x0], x1
- usubl v0.8h, v0.8b, v1.8b
- ld1 {v3.s}[1], [x2], x3
- ld1 {v2.s}[1], [x0], x1
- usubl v1.8h, v2.8b, v3.8b
- ld1 {v5.s}[1], [x2], x3
- ld1 {v4.s}[1], [x0], x1
- usubl v2.8h, v4.8b, v5.8b
- ld1 {v7.s}[1], [x2], x3
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
- ld1 {v6.s}[1], [x0], x1
- usubl v3.8h, v6.8b, v7.8b
- add v6.8h, v2.8h, v3.8h
- sub v7.8h, v2.8h, v3.8h
- x265_satd_4x8_8x4_end_neon
-.endm
-
-// template<int w, int h>
-// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
-function PFX(pixel_satd_4x8_neon)
- pixel_satd_4x8_neon
- mov w0, v0.s[0]
- ret
-endfunc
-
-function PFX(pixel_satd_4x16_neon)
- mov w4, #0
- pixel_satd_4x8_neon
- mov w5, v0.s[0]
- add w4, w4, w5
- pixel_satd_4x8_neon
- mov w5, v0.s[0]
- add w0, w5, w4
- ret
-endfunc
-
-function PFX(pixel_satd_4x32_neon)
- mov w4, #0
-.rept 4
- pixel_satd_4x8_neon
- mov w5, v0.s[0]
- add w4, w4, w5
-.endr
- mov w0, w4
- ret
-endfunc
-
-function PFX(pixel_satd_12x16_neon)
- mov x4, x0
- mov x5, x2
- mov w7, #0
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-
- add x0, x4, #4
- add x2, x5, #4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-
- add x0, x4, #8
- add x2, x5, #8
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w0, w7, w6
- ret
-endfunc
-
-function PFX(pixel_satd_12x32_neon)
- mov x4, x0
- mov x5, x2
- mov w7, #0
-.rept 4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-.endr
-
- add x0, x4, #4
- add x2, x5, #4
-.rept 4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-.endr
-
- add x0, x4, #8
- add x2, x5, #8
-.rept 4
- pixel_satd_4x8_neon
- mov w6, v0.s[0]
- add w7, w7, w6
-.endr
-
- mov w0, w7
- ret
-endfunc
-
-function PFX(pixel_satd_8x4_neon)
- mov x4, x0
- mov x5, x2
- satd_4x4_neon
- add x0, x4, #4
- add x2, x5, #4
- umov x6, v0.d[0]
- satd_4x4_neon
- umov x0, v0.d[0]
- add x0, x0, x6
- ret
-endfunc
-
-.macro LOAD_DIFF_8x4 v0 v1 v2 v3
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x2], x3
- ld1 {v2.8b}, [x0], x1
- ld1 {v3.8b}, [x2], x3
- ld1 {v4.8b}, [x0], x1
- ld1 {v5.8b}, [x2], x3
- ld1 {v6.8b}, [x0], x1
- ld1 {v7.8b}, [x2], x3
- usubl \v0, v0.8b, v1.8b
- usubl \v1, v2.8b, v3.8b
- usubl \v2, v4.8b, v5.8b
- usubl \v3, v6.8b, v7.8b
-.endm
-
-.macro LOAD_DIFF_16x4 v0 v1 v2 v3 v4 v5 v6 v7
- ld1 {v0.16b}, [x0], x1
- ld1 {v1.16b}, [x2], x3
- ld1 {v2.16b}, [x0], x1
- ld1 {v3.16b}, [x2], x3
- ld1 {v4.16b}, [x0], x1
- ld1 {v5.16b}, [x2], x3
- ld1 {v6.16b}, [x0], x1
- ld1 {v7.16b}, [x2], x3
- usubl \v0, v0.8b, v1.8b
- usubl \v1, v2.8b, v3.8b
- usubl \v2, v4.8b, v5.8b
- usubl \v3, v6.8b, v7.8b
- usubl2 \v4, v0.16b, v1.16b
- usubl2 \v5, v2.16b, v3.16b
- usubl2 \v6, v4.16b, v5.16b
- usubl2 \v7, v6.16b, v7.16b
-.endm
-
-function PFX(satd_16x4_neon), export=0
- LOAD_DIFF_16x4 v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
- b PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-function PFX(satd_8x8_neon), export=0
- LOAD_DIFF_8x4 v16.8h, v17.8h, v18.8h, v19.8h
- LOAD_DIFF_8x4 v20.8h, v21.8h, v22.8h, v23.8h
- b PFX(satd_8x4v_8x8h_neon)
-endfunc
-
-// one vertical hadamard pass and two horizontal
-function PFX(satd_8x4v_8x8h_neon), export=0
- HADAMARD4_V v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
- HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
- trn4 v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
- trn4 v4.8h, v5.8h, v6.8h, v7.8h, v20.8h, v21.8h, v22.8h, v23.8h
- SUMSUB_ABCD v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
- SUMSUB_ABCD v20.8h, v21.8h, v22.8h, v23.8h, v4.8h, v5.8h, v6.8h, v7.8h
- trn4 v0.4s, v2.4s, v1.4s, v3.4s, v16.4s, v18.4s, v17.4s, v19.4s
- trn4 v4.4s, v6.4s, v5.4s, v7.4s, v20.4s, v22.4s, v21.4s, v23.4s
- ABS8 v0.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h
- smax v0.8h, v0.8h, v2.8h
- smax v1.8h, v1.8h, v3.8h
- smax v2.8h, v4.8h, v6.8h
- smax v3.8h, v5.8h, v7.8h
- ret
-endfunc
-
-function PFX(pixel_satd_8x8_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- add v1.8h, v2.8h, v3.8h
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_8x12_neon)
- mov x4, x0
- mov x5, x2
- mov x7, #0
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
-.rept 2
- sub x0, x0, #4
- sub x2, x2, #4
- mov x4, x0
- mov x5, x2
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
- add x0, x4, #4
- add x2, x5, #4
- satd_4x4_neon
- umov x6, v0.d[0]
- add x7, x7, x6
-.endr
- mov x0, x7
- ret
-endfunc
-
-function PFX(pixel_satd_8x16_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_8x32_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 3
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_8x64_neon)
- mov x10, x30
- bl PFX(satd_8x8_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 7
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x4_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x8_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x12_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 2
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x16_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 3
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x24_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 5
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-.macro pixel_satd_16x32_neon
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 7
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-function PFX(pixel_satd_16x32_neon)
- mov x10, x30
- pixel_satd_16x32_neon
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_16x64_neon)
- mov x10, x30
- bl PFX(satd_16x4_neon)
- add v30.8h, v0.8h, v1.8h
- add v31.8h, v2.8h, v3.8h
-.rept 15
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_24x32_neon)
- mov x10, x30
- mov x7, #0
- mov x4, x0
- mov x5, x2
-.rept 3
- movi v30.8h, #0
- movi v31.8h, #0
-.rept 4
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
- add x4, x4, #8
- add x5, x5, #8
- mov x0, x4
- mov x2, x5
-.endr
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_24x64_neon)
- mov x10, x30
- mov x7, #0
- mov x4, x0
- mov x5, x2
-.rept 3
- movi v30.8h, #0
- movi v31.8h, #0
-.rept 4
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
- add x4, x4, #8
- add x5, x5, #8
- mov x0, x4
- mov x2, x5
-.endr
- sub x4, x4, #24
- sub x5, x5, #24
- add x0, x4, x1, lsl #5
- add x2, x5, x3, lsl #5
- mov x4, x0
- mov x5, x2
-.rept 3
- movi v30.8h, #0
- movi v31.8h, #0
-.rept 4
- bl PFX(satd_8x8_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
- add x4, x4, #8
- add x5, x5, #8
- mov x0, x4
- mov x2, x5
-.endr
- mov x0, x7
- ret x10
-endfunc
-
-.macro pixel_satd_32x8
- mov x4, x0
- mov x5, x2
-.rept 2
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
- add x0, x4, #16
- add x2, x5, #16
-.rept 2
- bl PFX(satd_16x4_neon)
- add v30.8h, v30.8h, v0.8h
- add v31.8h, v31.8h, v1.8h
- add v30.8h, v30.8h, v2.8h
- add v31.8h, v31.8h, v3.8h
-.endr
-.endm
-
-.macro satd_32x16_neon
- movi v30.8h, #0
- movi v31.8h, #0
- pixel_satd_32x8
- sub x0, x0, #16
- sub x2, x2, #16
- pixel_satd_32x8
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
-.endm
-
-.macro satd_64x16_neon
- mov x8, x0
- mov x9, x2
- satd_32x16_neon
- add x7, x7, x6
- add x0, x8, #32
- add x2, x9, #32
- satd_32x16_neon
- add x7, x7, x6
-.endm
-
-function PFX(pixel_satd_32x8_neon)
- mov x10, x30
- mov x7, #0
- mov x4, x0
- mov x5, x2
- movi v30.8h, #0
- movi v31.8h, #0
- pixel_satd_32x8
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x16_neon)
- mov x10, x30
- satd_32x16_neon
- mov x0, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x24_neon)
- mov x10, x30
- satd_32x16_neon
- movi v30.8h, #0
- movi v31.8h, #0
- sub x0, x0, #16
- sub x2, x2, #16
- pixel_satd_32x8
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- add x0, x0, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x32_neon)
- mov x10, x30
- mov x7, #0
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
- satd_32x16_neon
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x48_neon)
- mov x10, x30
- mov x7, #0
-.rept 2
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
-.endr
- satd_32x16_neon
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_32x64_neon)
- mov x10, x30
- mov x7, #0
-.rept 3
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
-.endr
- satd_32x16_neon
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x16_neon)
- mov x10, x30
- mov x7, #0
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x32_neon)
- mov x10, x30
- mov x7, #0
- satd_64x16_neon
- sub x0, x0, #48
- sub x2, x2, #48
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x48_neon)
- mov x10, x30
- mov x7, #0
-.rept 2
- satd_64x16_neon
- sub x0, x0, #48
- sub x2, x2, #48
-.endr
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_64x64_neon)
- mov x10, x30
- mov x7, #0
-.rept 3
- satd_64x16_neon
- sub x0, x0, #48
- sub x2, x2, #48
-.endr
- satd_64x16_neon
- mov x0, x7
- ret x10
-endfunc
-
-function PFX(pixel_satd_48x64_neon)
- mov x10, x30
- mov x7, #0
- mov x8, x0
- mov x9, x2
-.rept 3
- satd_32x16_neon
- sub x0, x0, #16
- sub x2, x2, #16
- add x7, x7, x6
-.endr
- satd_32x16_neon
- add x7, x7, x6
-
- add x0, x8, #32
- add x2, x9, #32
- pixel_satd_16x32_neon
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x7, x7, x6
-
- movi v30.8h, #0
- movi v31.8h, #0
- pixel_satd_16x32_neon
- add v0.8h, v30.8h, v31.8h
- uaddlv s0, v0.8h
- mov w6, v0.s[0]
- add x0, x7, x6
- ret x10
-endfunc
-
-function PFX(sa8d_8x8_neon), export=0
- LOAD_DIFF_8x4 v16.8h, v17.8h, v18.8h, v19.8h
- LOAD_DIFF_8x4 v20.8h, v21.8h, v22.8h, v23.8h
- HADAMARD4_V v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
- HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
- SUMSUB_ABCD v0.8h, v16.8h, v1.8h, v17.8h, v16.8h, v20.8h, v17.8h, v21.8h
- SUMSUB_ABCD v2.8h, v18.8h, v3.8h, v19.8h, v18.8h, v22.8h, v19.8h, v23.8h
- trn4 v4.8h, v5.8h, v6.8h, v7.8h, v0.8h, v1.8h, v2.8h, v3.8h
- trn4 v20.8h, v21.8h, v22.8h, v23.8h, v16.8h, v17.8h, v18.8h, v19.8h
- SUMSUB_ABCD v2.8h, v3.8h, v24.8h, v25.8h, v20.8h, v21.8h, v4.8h, v5.8h
- SUMSUB_ABCD v0.8h, v1.8h, v4.8h, v5.8h, v22.8h, v23.8h, v6.8h, v7.8h
- trn4 v20.4s, v22.4s, v21.4s, v23.4s, v2.4s, v0.4s, v3.4s, v1.4s
- trn4 v16.4s, v18.4s, v17.4s, v19.4s, v24.4s, v4.4s, v25.4s, v5.4s
- SUMSUB_ABCD v0.8h, v2.8h, v1.8h, v3.8h, v20.8h, v22.8h, v21.8h, v23.8h
- SUMSUB_ABCD v4.8h, v6.8h, v5.8h, v7.8h, v16.8h, v18.8h, v17.8h, v19.8h
- trn4 v16.2d, v20.2d, v17.2d, v21.2d, v0.2d, v4.2d, v1.2d, v5.2d
- trn4 v18.2d, v22.2d, v19.2d, v23.2d, v2.2d, v6.2d, v3.2d, v7.2d
- ABS8 v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
- smax v16.8h, v16.8h, v20.8h
- smax v17.8h, v17.8h, v21.8h
- smax v18.8h, v18.8h, v22.8h
- smax v19.8h, v19.8h, v23.8h
- add v0.8h, v16.8h, v17.8h
- add v1.8h, v18.8h, v19.8h
- ret
-endfunc
-
-function PFX(pixel_sa8d_8x8_neon)
- mov x10, x30
- bl PFX(sa8d_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w0, v0.s[0]
- add w0, w0, #1
- lsr w0, w0, #1
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_8x16_neon)
- mov x10, x30
- bl PFX(sa8d_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w5, v0.s[0]
- add w5, w5, #1
- lsr w5, w5, #1
- bl PFX(sa8d_8x8_neon)
- add v0.8h, v0.8h, v1.8h
- uaddlv s0, v0.8h
- mov w4, v0.s[0]
- add w4, w4, #1
- lsr w4, w4, #1
- add w0, w4, w5
- ret x10
-endfunc
-
-.macro sa8d_16x16 reg
- bl PFX(sa8d_8x8_neon)
- uaddlp v30.4s, v0.8h
- uaddlp v31.4s, v1.8h
- bl PFX(sa8d_8x8_neon)
- uadalp v30.4s, v0.8h
- uadalp v31.4s, v1.8h
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- bl PFX(sa8d_8x8_neon)
- uadalp v30.4s, v0.8h
- uadalp v31.4s, v1.8h
- bl PFX(sa8d_8x8_neon)
- uadalp v30.4s, v0.8h
- uadalp v31.4s, v1.8h
- add v0.4s, v30.4s, v31.4s
- addv s0, v0.4s
- mov \reg, v0.s[0]
- add \reg, \reg, #1
- lsr \reg, \reg, #1
-.endm
-
-function PFX(pixel_sa8d_16x16_neon)
- mov x10, x30
- sa8d_16x16 w0
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_16x32_neon)
- mov x10, x30
- sa8d_16x16 w4
- sub x0, x0, #8
- sub x2, x2, #8
- sa8d_16x16 w5
- add w0, w4, w5
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_32x32_neon)
- mov x10, x30
- sa8d_16x16 w4
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w5
- sub x0, x0, #24
- sub x2, x2, #24
- sa8d_16x16 w6
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w7
- add w4, w4, w5
- add w6, w6, w7
- add w0, w4, w6
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_32x64_neon)
- mov x10, x30
- mov w11, #4
- mov w9, #0
-.Loop_sa8d_32:
- sub w11, w11, #1
- sa8d_16x16 w4
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w5
- add w4, w4, w5
- add w9, w9, w4
- sub x0, x0, #24
- sub x2, x2, #24
- cbnz w11, .Loop_sa8d_32
- mov w0, w9
- ret x10
-endfunc
-
-function PFX(pixel_sa8d_64x64_neon)
- mov x10, x30
- mov w11, #4
- mov w9, #0
-.Loop_sa8d_64:
- sub w11, w11, #1
- sa8d_16x16 w4
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w5
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w6
- sub x0, x0, x1, lsl #4
- sub x2, x2, x3, lsl #4
- add x0, x0, #8
- add x2, x2, #8
- sa8d_16x16 w7
- add w4, w4, w5
- add w6, w6, w7
- add w8, w4, w6
- add w9, w9, w8
-
- sub x0, x0, #56
- sub x2, x2, #56
- cbnz w11, .Loop_sa8d_64
- mov w0, w9
- ret x10
-endfunc
-
/***** dequant_scaling*****/
// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
function PFX(dequant_scaling_neon)
--
2.39.5 (Apple Git-154)