[x265] [PATCH 04/18] AArch64: Refactor types and conversions in pixel-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:19:25 UTC 2024
Use correct types for all variables/operations and make all vector
conversions explicit in common/aarch64/pixel-prim.cpp.
---
source/common/aarch64/pixel-prim.cpp | 524 ++++++++++++++-------------
1 file changed, 270 insertions(+), 254 deletions(-)
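
For reference, the pattern applied throughout the file looks roughly like the sketch below. This is not taken from the patch; the helper name satd_reduce_sketch is invented for illustration. The idea is that results of signed intrinsics such as vabsq_s16 are reinterpreted explicitly with vreinterpretq_* before being passed to unsigned intrinsics such as vmaxq_u16, instead of relying on implicit vector conversions.

#include <arm_neon.h>

/* Illustrative only: reduce two int16x8_t vectors to sum(max(|a|, |b|)).
 * The reinterpret does not change any bits at runtime; it only makes the
 * signed-to-unsigned type change explicit to the compiler, so the operand
 * types match the intrinsic signatures exactly. */
static inline uint32_t satd_reduce_sketch(int16x8_t a, int16x8_t b)
{
    uint16x8_t abs_a = vreinterpretq_u16_s16(vabsq_s16(a));
    uint16x8_t abs_b = vreinterpretq_u16_s16(vabsq_s16(b));

    uint16x8_t max_ab = vmaxq_u16(abs_a, abs_b);
    return vaddlvq_u16(max_ab);
}
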
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 9598f5498..dd3fd4637 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -36,15 +36,21 @@ static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
const int16x8_t s1, const int16x8_t s2)
{
- t1 = vtrn1q_s32(s1, s2);
- t2 = vtrn2q_s32(s1, s2);
+ int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
+ int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
+
+ t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
+ t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
}
static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
const int16x8_t s1, const int16x8_t s2)
{
- t1 = vtrn1q_s64(s1, s2);
- t2 = vtrn2q_s64(s1, s2);
+ int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
+ int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
+
+ t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
+ t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
}
static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
@@ -84,16 +90,16 @@ static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int1
transpose_4s_8h(v0, v1, v16, v18);
transpose_4s_8h(v2, v3, v17, v19);
- v0 = vabsq_s16(v0);
- v1 = vabsq_s16(v1);
- v2 = vabsq_s16(v2);
- v3 = vabsq_s16(v3);
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+ uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+ uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
- v0 = vmaxq_u16(v0, v1);
- v1 = vmaxq_u16(v2, v3);
+ uint16x8_t max0 = vmaxq_u16(abs0, abs1);
+ uint16x8_t max1 = vmaxq_u16(abs2, abs3);
- v0 = vaddq_u16(v0, v1);
- return vaddlvq_u16(v0);
+ uint16x8_t sum = vaddq_u16(max0, max1);
+ return vaddlvq_u16(sum);
}
static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
@@ -109,11 +115,11 @@ static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
transpose_4s_8h(v0, v1, v2, v3);
- v0 = vabsq_s16(v0);
- v1 = vabsq_s16(v1);
- v0 = vmaxq_u16(v0, v1);
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+ uint16x8_t max = vmaxq_u16(abs0, abs1);
- return vaddlvq_s16(v0);
+ return vaddlvq_u16(max);
}
static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
@@ -141,20 +147,19 @@ static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, in
transpose_4s_8h(v4, v6, v20, v22);
transpose_4s_8h(v5, v7, v21, v23);
- v0 = vabsq_s16(v0);
- v1 = vabsq_s16(v1);
- v2 = vabsq_s16(v2);
- v3 = vabsq_s16(v3);
- v4 = vabsq_s16(v4);
- v5 = vabsq_s16(v5);
- v6 = vabsq_s16(v6);
- v7 = vabsq_s16(v7);
-
- v0 = vmaxq_u16(v0, v2);
- v1 = vmaxq_u16(v1, v3);
- v2 = vmaxq_u16(v4, v6);
- v3 = vmaxq_u16(v5, v7);
-
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+ uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+ uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+ uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
+ uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
+ uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
+ uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+
+ v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
+ v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
+ v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
+ v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
}
#if HIGH_BIT_DEPTH
@@ -163,8 +168,11 @@ static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, in
static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
const int32x4_t s1, const int32x4_t s2)
{
- t1 = vtrn1q_s64(s1, s2);
- t2 = vtrn2q_s64(s1, s2);
+ int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
+ int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
+
+ t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
+ t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
}
static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
@@ -203,10 +211,10 @@ static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, cons
t2 = vld1q_u16(pix2 + 2 * stride_pix2);
t3 = vld1q_u16(pix2 + 3 * stride_pix2);
- v16 = vsubq_u16(r0, t0);
- v17 = vsubq_u16(r1, t1);
- v18 = vsubq_u16(r2, t2);
- v19 = vsubq_u16(r3, t3);
+ v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+ v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+ v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+ v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
r0 = vld1q_u16(pix1 + 4 * stride_pix1);
r1 = vld1q_u16(pix1 + 5 * stride_pix1);
@@ -218,10 +226,10 @@ static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, cons
t2 = vld1q_u16(pix2 + 6 * stride_pix2);
t3 = vld1q_u16(pix2 + 7 * stride_pix2);
- v20 = vsubq_u16(r0, t0);
- v21 = vsubq_u16(r1, t1);
- v22 = vsubq_u16(r2, t2);
- v23 = vsubq_u16(r3, t3);
+ v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+ v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+ v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+ v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
SUMSUB_AB(v0, v1, v16, v17);
SUMSUB_AB(v2, v3, v18, v19);
@@ -234,8 +242,8 @@ static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, cons
static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
{
- uint8x16_t r0, r1, r2, r3;
- uint8x16_t t0, t1, t2, t3;
+ uint16x8_t r0, r1, r2, r3;
+ uint16x8_t t0, t1, t2, t3;
int16x8_t v16, v17, v20, v21;
int16x8_t v18, v19, v22, v23;
@@ -249,11 +257,10 @@ static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const ui
t2 = vld1q_u16(pix2 + 2 * stride_pix2);
t3 = vld1q_u16(pix2 + 3 * stride_pix2);
-
- v16 = vsubq_u16((r0), (t0));
- v17 = vsubq_u16((r1), (t1));
- v18 = vsubq_u16((r2), (t2));
- v19 = vsubq_u16((r3), (t3));
+ v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+ v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+ v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+ v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
r0 = vld1q_u16(pix1 + 0 * stride_pix1 + 8);
r1 = vld1q_u16(pix1 + 1 * stride_pix1 + 8);
@@ -265,11 +272,10 @@ static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const ui
t2 = vld1q_u16(pix2 + 2 * stride_pix2 + 8);
t3 = vld1q_u16(pix2 + 3 * stride_pix2 + 8);
-
- v20 = vsubq_u16(r0, t0);
- v21 = vsubq_u16(r1, t1);
- v22 = vsubq_u16(r2, t2);
- v23 = vsubq_u16(r3, t3);
+ v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+ v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+ v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+ v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
SUMSUB_AB(v0, v1, v16, v17);
SUMSUB_AB(v2, v3, v18, v19);
@@ -295,7 +301,10 @@ int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16
uint16x8_t r0 = vcombine_u16(r0_0, r0_1);
uint16x8_t r1 = vcombine_u16(r1_0, r1_1);
- return _satd_4x4_neon(vsubq_u16(t0, r0), vsubq_u16(r1, t1));
+ int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(t0, r0));
+ int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+
+ return _satd_4x4_neon(v0, v1);
}
@@ -316,10 +325,10 @@ int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16
i6 = vld1q_u16(pix1 + 3 * stride_pix1);
i7 = vld1q_u16(pix2 + 3 * stride_pix2);
- int16x8_t v0 = vsubq_u16(i0, i1);
- int16x8_t v1 = vsubq_u16(i2, i3);
- int16x8_t v2 = vsubq_u16(i4, i5);
- int16x8_t v3 = vsubq_u16(i6, i7);
+ int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(i0, i1));
+ int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(i2, i3));
+ int16x8_t v2 = vreinterpretq_s16_u16(vsubq_u16(i4, i5));
+ int16x8_t v3 = vreinterpretq_s16_u16(vsubq_u16(i6, i7));
return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
}
@@ -327,17 +336,21 @@ int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16
int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
{
- int32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
+ uint32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
int16x8_t v0, v1, v2, v3;
- for (int offset = 0; offset <= 12; offset += 4) {
- _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1, pix2 + offset * stride_pix2, stride_pix2, v0, v1, v2, v3);
- v30 = vpadalq_u16(v30, v0);
- v30 = vpadalq_u16(v30, v1);
- v31 = vpadalq_u16(v31, v2);
- v31 = vpadalq_u16(v31, v3);
+
+ for (int offset = 0; offset <= 12; offset += 4)
+ {
+ _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1,
+                        pix2 + offset * stride_pix2, stride_pix2,
+ v0, v1, v2, v3);
+ v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v0));
+ v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v1));
+ v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v2));
+ v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v3));
}
- return vaddvq_s32(vaddq_s32(v30, v31));
+ return vaddvq_u32(vaddq_u32(v30, v31));
}
#else //HIGH_BIT_DEPTH
@@ -360,16 +373,14 @@ static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uin
t2 = vld1q_u8(pix2 + 2 * stride_pix2);
t3 = vld1q_u8(pix2 + 3 * stride_pix2);
-
-
- v16 = vsubl_u8(vget_low_u8(r0), vget_low_u8(t0));
- v20 = vsubl_high_u8(r0, t0);
- v17 = vsubl_u8(vget_low_u8(r1), vget_low_u8(t1));
- v21 = vsubl_high_u8(r1, t1);
- v18 = vsubl_u8(vget_low_u8(r2), vget_low_u8(t2));
- v22 = vsubl_high_u8(r2, t2);
- v19 = vsubl_u8(vget_low_u8(r3), vget_low_u8(t3));
- v23 = vsubl_high_u8(r3, t3);
+ v16 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r0), vget_low_u8(t0)));
+ v20 = vreinterpretq_s16_u16(vsubl_high_u8(r0, t0));
+ v17 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r1), vget_low_u8(t1)));
+ v21 = vreinterpretq_s16_u16(vsubl_high_u8(r1, t1));
+ v18 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r2), vget_low_u8(t2)));
+ v22 = vreinterpretq_s16_u16(vsubl_high_u8(r2, t2));
+ v19 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r3), vget_low_u8(t3)));
+ v23 = vreinterpretq_s16_u16(vsubl_high_u8(r3, t3));
SUMSUB_AB(v0, v1, v16, v17);
SUMSUB_AB(v2, v3, v18, v19);
@@ -398,10 +409,10 @@ static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const
t2 = vld1_u8(pix2 + 2 * stride_pix2);
t3 = vld1_u8(pix2 + 3 * stride_pix2);
- v16 = vsubl_u8(r0, t0);
- v17 = vsubl_u8(r1, t1);
- v18 = vsubl_u8(r2, t2);
- v19 = vsubl_u8(r3, t3);
+ v16 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
+ v17 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
+ v18 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
+ v19 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
r0 = vld1_u8(pix1 + 4 * stride_pix1);
r1 = vld1_u8(pix1 + 5 * stride_pix1);
@@ -413,10 +424,10 @@ static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const
t2 = vld1_u8(pix2 + 6 * stride_pix2);
t3 = vld1_u8(pix2 + 7 * stride_pix2);
- v20 = vsubl_u8(r0, t0);
- v21 = vsubl_u8(r1, t1);
- v22 = vsubl_u8(r2, t2);
- v23 = vsubl_u8(r3, t3);
+ v20 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
+ v21 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
+ v22 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
+ v23 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
SUMSUB_AB(v0, v1, v16, v17);
@@ -432,7 +443,8 @@ int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t
uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
- return _satd_4x4_neon(vsubl_u8(t0, r0), vsubl_u8(r1, t1));
+ return _satd_4x4_neon(vreinterpretq_s16_u16(vsubl_u8(t0, r0)),
+ vreinterpretq_s16_u16(vsubl_u8(r1, t1)));
}
@@ -449,53 +461,53 @@ int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t
i6 = vld1_u8(pix1 + 3 * stride_pix1);
i7 = vld1_u8(pix2 + 3 * stride_pix2);
- int16x8_t v0 = vsubl_u8(i0, i1);
- int16x8_t v1 = vsubl_u8(i2, i3);
- int16x8_t v2 = vsubl_u8(i4, i5);
- int16x8_t v3 = vsubl_u8(i6, i7);
+ int16x8_t v0 = vreinterpretq_s16_u16(vsubl_u8(i0, i1));
+ int16x8_t v1 = vreinterpretq_s16_u16(vsubl_u8(i2, i3));
+ int16x8_t v2 = vreinterpretq_s16_u16(vsubl_u8(i4, i5));
+ int16x8_t v3 = vreinterpretq_s16_u16(vsubl_u8(i6, i7));
return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
}
int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
{
- int16x8_t v30, v31;
+ uint16x8_t v30, v31;
int16x8_t v0, v1, v2, v3;
+ uint16x8_t t0, t1;
_satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
- v30 = vaddq_s16(v0, v1);
- v31 = vaddq_s16(v2, v3);
+ v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+ v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
_satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- v0 = vaddq_s16(v0, v1);
- v1 = vaddq_s16(v2, v3);
- v30 = vaddq_s16(v30, v0);
- v31 = vaddq_s16(v31, v1);
+ t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+ t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ v30 = vaddq_u16(v30, t0);
+ v31 = vaddq_u16(v31, t1);
_satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- v0 = vaddq_s16(v0, v1);
- v1 = vaddq_s16(v2, v3);
- v30 = vaddq_s16(v30, v0);
- v31 = vaddq_s16(v31, v1);
+ t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+ t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ v30 = vaddq_u16(v30, t0);
+ v31 = vaddq_u16(v31, t1);
_satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
- v0 = vaddq_s16(v0, v1);
- v1 = vaddq_s16(v2, v3);
- v30 = vaddq_s16(v30, v0);
- v31 = vaddq_s16(v31, v1);
-
- int32x4_t sum0 = vpaddlq_u16(v30);
- int32x4_t sum1 = vpaddlq_u16(v31);
- sum0 = vaddq_s32(sum0, sum1);
- return vaddvq_s32(sum0);
-
+ t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+ t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+ v30 = vaddq_u16(v30, t0);
+ v31 = vaddq_u16(v31, t1);
+
+ uint32x4_t sum0 = vpaddlq_u16(v30);
+ uint32x4_t sum1 = vpaddlq_u16(v31);
+ sum0 = vaddq_u32(sum0, sum1);
+ return vaddvq_u32(sum0);
}
#endif //HIGH_BIT_DEPTH
#if HIGH_BIT_DEPTH
-typedef int32x4_t sa8d_out_type;
+typedef uint32x4_t sa8d_out_type;
#else
-typedef int16x8_t sa8d_out_type;
+typedef uint16x8_t sa8d_out_type;
#endif
static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
@@ -546,31 +558,30 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
transpose_2d_8h(v18, v22, v2, v6);
transpose_2d_8h(v19, v23, v3, v7);
+ uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v16));
+ uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v17));
+ uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v18));
+ uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v19));
+ uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v20));
+ uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v21));
+ uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v22));
+ uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v23));
- v16 = vabsq_s16(v16);
- v17 = vabsq_s16(v17);
- v18 = vabsq_s16(v18);
- v19 = vabsq_s16(v19);
- v20 = vabsq_s16(v20);
- v21 = vabsq_s16(v21);
- v22 = vabsq_s16(v22);
- v23 = vabsq_s16(v23);
-
- v16 = vmaxq_u16(v16, v20);
- v17 = vmaxq_u16(v17, v21);
- v18 = vmaxq_u16(v18, v22);
- v19 = vmaxq_u16(v19, v23);
+ uint16x8_t max0 = vmaxq_u16(abs0, abs4);
+ uint16x8_t max1 = vmaxq_u16(abs1, abs5);
+ uint16x8_t max2 = vmaxq_u16(abs2, abs6);
+ uint16x8_t max3 = vmaxq_u16(abs3, abs7);
#if HIGH_BIT_DEPTH
- out0 = vpaddlq_u16(v16);
- out1 = vpaddlq_u16(v17);
- out0 = vpadalq_u16(out0, v18);
- out1 = vpadalq_u16(out1, v19);
+ out0 = vpaddlq_u16(max0);
+ out1 = vpaddlq_u16(max1);
+ out0 = vpadalq_u16(out0, max2);
+ out1 = vpadalq_u16(out1, max3);
#else //HIGH_BIT_DEPTH
- out0 = vaddq_u16(v16, v17);
- out1 = vaddq_u16(v18, v19);
+ out0 = vaddq_u16(max0, max1);
+ out1 = vaddq_u16(max2, max3);
#endif //HIGH_BIT_DEPTH
@@ -643,41 +654,41 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
v23l = v3h;
v23h = v7h;
- v16l = vabsq_s32(v16l);
- v17l = vabsq_s32(v17l);
- v18l = vabsq_s32(v18l);
- v19l = vabsq_s32(v19l);
- v20l = vabsq_s32(v20l);
- v21l = vabsq_s32(v21l);
- v22l = vabsq_s32(v22l);
- v23l = vabsq_s32(v23l);
-
- v16h = vabsq_s32(v16h);
- v17h = vabsq_s32(v17h);
- v18h = vabsq_s32(v18h);
- v19h = vabsq_s32(v19h);
- v20h = vabsq_s32(v20h);
- v21h = vabsq_s32(v21h);
- v22h = vabsq_s32(v22h);
- v23h = vabsq_s32(v23h);
-
- v16l = vmaxq_u32(v16l, v20l);
- v17l = vmaxq_u32(v17l, v21l);
- v18l = vmaxq_u32(v18l, v22l);
- v19l = vmaxq_u32(v19l, v23l);
-
- v16h = vmaxq_u32(v16h, v20h);
- v17h = vmaxq_u32(v17h, v21h);
- v18h = vmaxq_u32(v18h, v22h);
- v19h = vmaxq_u32(v19h, v23h);
-
- v16l = vaddq_u32(v16l, v16h);
- v17l = vaddq_u32(v17l, v17h);
- v18l = vaddq_u32(v18l, v18h);
- v19l = vaddq_u32(v19l, v19h);
-
- out0 = vaddq_u32(v16l, v17l);
- out1 = vaddq_u32(v18l, v19l);
+ uint32x4_t abs0_lo = vreinterpretq_u32_s32(vabsq_s32(v16l));
+ uint32x4_t abs1_lo = vreinterpretq_u32_s32(vabsq_s32(v17l));
+ uint32x4_t abs2_lo = vreinterpretq_u32_s32(vabsq_s32(v18l));
+ uint32x4_t abs3_lo = vreinterpretq_u32_s32(vabsq_s32(v19l));
+ uint32x4_t abs4_lo = vreinterpretq_u32_s32(vabsq_s32(v20l));
+ uint32x4_t abs5_lo = vreinterpretq_u32_s32(vabsq_s32(v21l));
+ uint32x4_t abs6_lo = vreinterpretq_u32_s32(vabsq_s32(v22l));
+ uint32x4_t abs7_lo = vreinterpretq_u32_s32(vabsq_s32(v23l));
+
+ uint32x4_t abs0_hi = vreinterpretq_u32_s32(vabsq_s32(v16h));
+ uint32x4_t abs1_hi = vreinterpretq_u32_s32(vabsq_s32(v17h));
+ uint32x4_t abs2_hi = vreinterpretq_u32_s32(vabsq_s32(v18h));
+ uint32x4_t abs3_hi = vreinterpretq_u32_s32(vabsq_s32(v19h));
+ uint32x4_t abs4_hi = vreinterpretq_u32_s32(vabsq_s32(v20h));
+ uint32x4_t abs5_hi = vreinterpretq_u32_s32(vabsq_s32(v21h));
+ uint32x4_t abs6_hi = vreinterpretq_u32_s32(vabsq_s32(v22h));
+ uint32x4_t abs7_hi = vreinterpretq_u32_s32(vabsq_s32(v23h));
+
+ uint32x4_t max0_lo = vmaxq_u32(abs0_lo, abs4_lo);
+ uint32x4_t max1_lo = vmaxq_u32(abs1_lo, abs5_lo);
+ uint32x4_t max2_lo = vmaxq_u32(abs2_lo, abs6_lo);
+ uint32x4_t max3_lo = vmaxq_u32(abs3_lo, abs7_lo);
+
+ uint32x4_t max0_hi = vmaxq_u32(abs0_hi, abs4_hi);
+ uint32x4_t max1_hi = vmaxq_u32(abs1_hi, abs5_hi);
+ uint32x4_t max2_hi = vmaxq_u32(abs2_hi, abs6_hi);
+ uint32x4_t max3_hi = vmaxq_u32(abs3_hi, abs7_hi);
+
+ uint32x4_t sum0 = vaddq_u32(max0_lo, max0_hi);
+ uint32x4_t sum1 = vaddq_u32(max1_lo, max1_hi);
+ uint32x4_t sum2 = vaddq_u32(max2_lo, max2_hi);
+ uint32x4_t sum3 = vaddq_u32(max3_lo, max3_hi);
+
+ out0 = vaddq_u32(sum0, sum1);
+ out1 = vaddq_u32(sum2, sum3);
#endif
@@ -700,24 +711,19 @@ static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const
int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
{
- int16x8_t v30, v31;
int16x8_t v0, v1, v2, v3;
_satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
-#if !(HIGH_BIT_DEPTH)
- v30 = vaddq_u16(v0, v1);
- v31 = vaddq_u16(v2, v3);
+ uint16x8_t v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+ uint16x8_t v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+#if !(HIGH_BIT_DEPTH)
uint16x8_t sum = vaddq_u16(v30, v31);
- return vaddvq_s32(vpaddlq_u16(sum));
+ return vaddvq_u32(vpaddlq_u16(sum));
#else
-
- v30 = vaddq_u16(v0, v1);
- v31 = vaddq_u16(v2, v3);
-
- int32x4_t sum = vpaddlq_u16(v30);
+ uint32x4_t sum = vpaddlq_u16(v30);
sum = vpadalq_u16(sum, v31);
- return vaddvq_s32(sum);
+ return vaddvq_u32(sum);
#endif
}
@@ -732,10 +738,10 @@ int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pi
_sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
#if HIGH_BIT_DEPTH
- int32x4_t s = vaddq_u32(res0, res1);
+ uint32x4_t s = vaddq_u32(res0, res1);
return (vaddvq_u32(s) + 1) >> 1;
#else
- return (vaddlvq_s16(vaddq_u16(res0, res1)) + 1) >> 1;
+ return (vaddlvq_u16(vaddq_u16(res0, res1)) + 1) >> 1;
#endif
}
@@ -748,7 +754,7 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
int16x8_t v0, v1, v2, v3;
int16x8_t v20, v21, v22, v23;
sa8d_out_type res0, res1;
- int32x4_t v30, v31;
+ uint32x4_t v30, v31;
_sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
_sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
@@ -757,7 +763,7 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
v30 = vpaddlq_u16(res0);
v31 = vpaddlq_u16(res1);
#else
- v30 = vaddq_s32(res0, res1);
+ v30 = vaddq_u32(res0, res1);
#endif
_sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
@@ -767,7 +773,7 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
v30 = vpadalq_u16(v30, res0);
v31 = vpadalq_u16(v31, res1);
#else
- v31 = vaddq_s32(res0, res1);
+ v31 = vaddq_u32(res0, res1);
#endif
@@ -779,8 +785,8 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
v30 = vpadalq_u16(v30, res0);
v31 = vpadalq_u16(v31, res1);
#else
- v30 = vaddq_s32(v30, res0);
- v31 = vaddq_s32(v31, res1);
+ v30 = vaddq_u32(v30, res0);
+ v31 = vaddq_u32(v31, res1);
#endif
_sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
@@ -791,8 +797,8 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
v30 = vpadalq_u16(v30, res0);
v31 = vpadalq_u16(v31, res1);
#else
- v30 = vaddq_s32(v30, res0);
- v31 = vaddq_s32(v31, res1);
+ v30 = vaddq_u32(v30, res0);
+ v31 = vaddq_u32(v31, res1);
#endif
v30 = vaddq_u32(v30, v31);
@@ -840,19 +846,18 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
{
uint16x8_t p1 = vld1q_u16(pix1 + x);
uint16x8_t p2 = vld1q_u16(pix2 + x);
- vsum16_1 = vabaq_s16(vsum16_1, p1, p2);
-
+ vsum16_1 = vabaq_u16(vsum16_1, p1, p2);
}
if (lx & 4)
{
uint16x4_t p1 = vld1_u16(pix1 + x);
uint16x4_t p2 = vld1_u16(pix2 + x);
- sum += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
+ sum += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p2));
x += 4;
}
if (lx >= 4)
{
- sum += vaddlvq_s16(vsum16_1);
+ sum += vaddlvq_u16(vsum16_1);
}
#else
@@ -924,10 +929,9 @@ void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
uint16x8_t p2 = vld1q_u16(pix2 + x);
uint16x8_t p3 = vld1q_u16(pix3 + x);
uint16x8_t p4 = vld1q_u16(pix4 + x);
- vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
- vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
- vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
-
+ vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
+ vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
+ vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
}
if (lx & 4)
{
@@ -935,16 +939,16 @@ void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
uint16x4_t p2 = vld1_u16(pix2 + x);
uint16x4_t p3 = vld1_u16(pix3 + x);
uint16x4_t p4 = vld1_u16(pix4 + x);
- res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
- res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p3));
- res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p4));
+ res[0] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p2));
+ res[1] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p3));
+ res[2] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p4));
x += 4;
}
if (lx >= 4)
{
- res[0] += vaddlvq_s16(vsum16_0);
- res[1] += vaddlvq_s16(vsum16_1);
- res[2] += vaddlvq_s16(vsum16_2);
+ res[0] += vaddlvq_u16(vsum16_0);
+ res[1] += vaddlvq_u16(vsum16_1);
+ res[2] += vaddlvq_u16(vsum16_2);
}
#else
@@ -1009,7 +1013,8 @@ template<int lx, int ly>
void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
intptr_t frefstride, int32_t *res)
{
- int32x4_t result = {0};
+ uint32x4_t result = vdupq_n_u32(0);
+
for (int y = 0; y < ly; y++)
{
int x = 0;
@@ -1025,14 +1030,14 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
uint16x8x2_t p3 = vld1q_u16_x2(&pix3[x]);
uint16x8x2_t p4 = vld1q_u16_x2(&pix4[x]);
uint16x8x2_t p5 = vld1q_u16_x2(&pix5[x]);
- vsum16_0 = vabaq_s16(vsum16_0, p1.val[0], p2.val[0]);
- vsum16_1 = vabaq_s16(vsum16_1, p1.val[0], p3.val[0]);
- vsum16_2 = vabaq_s16(vsum16_2, p1.val[0], p4.val[0]);
- vsum16_3 = vabaq_s16(vsum16_3, p1.val[0], p5.val[0]);
- vsum16_0 = vabaq_s16(vsum16_0, p1.val[1], p2.val[1]);
- vsum16_1 = vabaq_s16(vsum16_1, p1.val[1], p3.val[1]);
- vsum16_2 = vabaq_s16(vsum16_2, p1.val[1], p4.val[1]);
- vsum16_3 = vabaq_s16(vsum16_3, p1.val[1], p5.val[1]);
+ vsum16_0 = vabaq_u16(vsum16_0, p1.val[0], p2.val[0]);
+ vsum16_1 = vabaq_u16(vsum16_1, p1.val[0], p3.val[0]);
+ vsum16_2 = vabaq_u16(vsum16_2, p1.val[0], p4.val[0]);
+ vsum16_3 = vabaq_u16(vsum16_3, p1.val[0], p5.val[0]);
+ vsum16_0 = vabaq_u16(vsum16_0, p1.val[1], p2.val[1]);
+ vsum16_1 = vabaq_u16(vsum16_1, p1.val[1], p3.val[1]);
+ vsum16_2 = vabaq_u16(vsum16_2, p1.val[1], p4.val[1]);
+ vsum16_3 = vabaq_u16(vsum16_3, p1.val[1], p5.val[1]);
}
if (lx & 8)
{
@@ -1041,10 +1046,10 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
uint16x8_t p3 = vld1q_u16(pix3 + x);
uint16x8_t p4 = vld1q_u16(pix4 + x);
uint16x8_t p5 = vld1q_u16(pix5 + x);
- vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
- vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
- vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
- vsum16_3 = vabaq_s16(vsum16_3, p1, p5);
+ vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
+ vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
+ vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
+ vsum16_3 = vabaq_u16(vsum16_3, p1, p5);
x += 8;
}
if (lx & 4)
@@ -1052,15 +1057,17 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
/* This is equivalent to getting the absolute difference of pix1[x] with each of
* pix2 - pix5, then summing across the vector (4 values each) and adding the
* result to result. */
- uint16x8_t p1 = vreinterpretq_s16_u64(
- vld1q_dup_u64((uint64_t *)&pix1[x]));
- uint16x8_t p2_3 = vcombine_s16(vld1_u16(pix2 + x), vld1_u16(pix3 + x));
- uint16x8_t p4_5 = vcombine_s16(vld1_u16(pix4 + x), vld1_u16(pix5 + x));
+ uint16x8_t p1 = vreinterpretq_u16_u64(
+ vld1q_dup_u64((uint64_t *)&pix1[x]));
+ uint16x8_t p2_3 = vcombine_u16(vld1_u16(pix2 + x),
+ vld1_u16(pix3 + x));
+ uint16x8_t p4_5 = vcombine_u16(vld1_u16(pix4 + x),
+ vld1_u16(pix5 + x));
uint16x8_t a = vabdq_u16(p1, p2_3);
uint16x8_t b = vabdq_u16(p1, p4_5);
- result = vpadalq_s16(result, vpaddq_s16(a, b));
+ result = vpadalq_u16(result, vpaddq_u16(a, b));
x += 4;
}
if (lx >= 4)
@@ -1105,7 +1112,7 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
}
if (lx & 4)
{
- uint8x16_t p1 = vreinterpretq_u32_u8(
+ uint8x16_t p1 = vreinterpretq_u8_u32(
vld1q_dup_u32((uint32_t *)&pix1[x]));
uint32x4_t p_x4 = vdupq_n_u32(0);
@@ -1114,11 +1121,11 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
p_x4 = vld1q_lane_u32((uint32_t *)&pix4[x], p_x4, 2);
p_x4 = vld1q_lane_u32((uint32_t *)&pix5[x], p_x4, 3);
- uint16x8_t sum = vabdl_u8(vget_low_u8(p1), vget_low_u8(p_x4));
- uint16x8_t sum2 = vabdl_high_u8(p1, p_x4);
+ uint16x8_t sum = vabdl_u8(vget_low_u8(p1),
+ vget_low_u8(vreinterpretq_u8_u32(p_x4)));
+ uint16x8_t sum2 = vabdl_high_u8(p1, vreinterpretq_u8_u32(p_x4));
- uint16x8_t a = vpaddq_u16(sum, sum2);
- result = vpadalq_u16(result, a);
+ result = vpadalq_u16(result, vpaddq_u16(sum, sum2));
}
if (lx >= 4)
{
@@ -1143,7 +1150,8 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
pix4 += frefstride;
pix5 += frefstride;
}
- vst1q_s32(res, result);
+
+ vst1q_s32(res, vreinterpretq_s32_u32(result));
}
@@ -1164,17 +1172,17 @@ sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t st
{
// We have to cast to the 'real' type so that this block
// will compile for both low and high bitdepth.
- const uint16_t *p1_u16 = (const uint16_t*)pix1;
- const uint16_t *p2_u16 = (const uint16_t*)pix2;
- tmp = vsubq_u16(vld1q_u16(p1_u16 + x), vld1q_u16(p2_u16 + x));
+ uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
+ uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
+ tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
}
else if (sizeof(T1) == 1 && sizeof(T2) == 1)
{
// We have to cast to the 'real' type so that this block
// will compile for both low and high bitdepth.
- const uint8_t *p1_u8 = (const uint8_t*)pix1;
- const uint8_t *p2_u8 = (const uint8_t*)pix2;
- tmp = vsubl_u8(vld1_u8(p1_u8 + x), vld1_u8(p2_u8 + x));
+ uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
+ uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
+ tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
}
else
{
@@ -1191,17 +1199,18 @@ sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t st
if (sizeof(T1) == 2 && sizeof(T2) == 2)
{
- int32x4_t vsum = vaddq_u32(vsum1, vsum2);;
- sum += vaddvq_u32(vsum);
- vsum1 = vsum2 = vdupq_n_u16(0);
+ int32x4_t vsum = vaddq_s32(vsum1, vsum2);
+ sum += vaddvq_s32(vsum);
+ vsum1 = vsum2 = vdupq_n_s32(0);
}
pix1 += stride_pix1;
pix2 += stride_pix2;
}
- int32x4_t vsum = vaddq_u32(vsum1, vsum2);
- return sum + vaddvq_u32(vsum);
+ int32x4_t vsum = vaddq_s32(vsum1, vsum2);
+
+ return sum + vaddvq_s32(vsum);
}
@@ -1214,9 +1223,10 @@ void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t st
for (; (x + 8) <= bx; x += 8)
{
#if HIGH_BIT_DEPTH
- vst1q_s16(a + x, vld1q_u16(b + x));
+ vst1q_s16(a + x, vreinterpretq_s16_u16(vld1q_u16(b + x)));
#else
- vst1q_s16(a + x, vmovl_u8(vld1_u8(b + x)));
+ int16x8_t in = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b + x)));
+ vst1q_s16(a + x, in);
#endif
}
for (; x < bx; x++)
@@ -1283,9 +1293,11 @@ void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixe
for (; (x + 8) <= bx; x += 8)
{
#if HIGH_BIT_DEPTH
- vst1q_s16(a + x, vsubq_u16(vld1q_u16(b0 + x), vld1q_u16(b1 + x)));
+ uint16x8_t diff = vsubq_u16(vld1q_u16(b0 + x), vld1q_u16(b1 + x));
+ vst1q_s16(a + x, vreinterpretq_s16_u16(diff));
#else
- vst1q_s16(a + x, vsubl_u8(vld1_u8(b0 + x), vld1_u8(b1 + x)));
+ uint16x8_t diff = vsubl_u8(vld1_u8(b0 + x), vld1_u8(b1 + x));
+ vst1q_s16(a + x, vreinterpretq_s16_u16(diff));
#endif
}
for (; x < bx; x++)
@@ -1312,13 +1324,13 @@ void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_
int16x8_t b1e = vld1q_s16(b1 + x);
int16x8_t b0e;
#if HIGH_BIT_DEPTH
- b0e = vld1q_u16(b0 + x);
+ b0e = vreinterpretq_s16_u16(vld1q_u16(b0 + x));
t = vaddq_s16(b0e, b1e);
t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
t = vmaxq_s16(t, vdupq_n_s16(0));
- vst1q_u16(a + x, t);
+ vst1q_u16(a + x, vreinterpretq_u16_s16(t));
#else
- b0e = vmovl_u8(vld1_u8(b0 + x));
+ b0e = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b0 + x)));
t = vaddq_s16(b0e, b1e);
vst1_u8(a + x, vqmovun_s16(t));
#endif
@@ -1357,11 +1369,12 @@ void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t
t2 = vaddq_s32(t2, addon);
t1 = vshrq_n_s32(t1, shiftNum);
t2 = vshrq_n_s32(t2, shiftNum);
- int16x8_t t = vuzp1q_s16(t1, t2);
+ int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(t1),
+ vreinterpretq_s16_s32(t2));
#if HIGH_BIT_DEPTH
t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
t = vmaxq_s16(t, vdupq_n_s16(0));
- vst1q_u16(dst + x, t);
+ vst1q_u16(dst + x, vreinterpretq_u16_s16(t));
#else
vst1_u8(dst + x, vqmovun_s16(t));
#endif
@@ -1393,9 +1406,9 @@ void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t
uint16x8_t t = vrhaddq_u16(in0, in1);
vst1q_u16(dst + x, t);
#else
- int16x8_t in0 = vmovl_u8(vld1_u8(src0 + x));
- int16x8_t in1 = vmovl_u8(vld1_u8(src1 + x));
- int16x8_t t = vrhaddq_s16(in0, in1);
+ uint16x8_t in0 = vmovl_u8(vld1_u8(src0 + x));
+ uint16x8_t in1 = vmovl_u8(vld1_u8(src1 + x));
+ uint16x8_t t = vrhaddq_u16(in0, in1);
vst1_u8(dst + x, vmovn_u16(t));
#endif
}
@@ -1440,25 +1453,27 @@ uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
{
uint32_t sum = 0, sqr = 0;
- int32x4_t vsqr = vdupq_n_s32(0);
+ uint32x4_t vsqr = vdupq_n_u32(0);
+
for (int y = 0; y < size; y++)
{
int x = 0;
- int16x8_t vsum = vdupq_n_s16(0);
+ uint16x8_t vsum = vdupq_n_u16(0);
for (; (x + 8) <= size; x += 8)
{
- int16x8_t in;
+ uint16x8_t in;
in = vmovl_u8(vld1_u8(pix + x));
vsum = vaddq_u16(vsum, in);
- vsqr = vmlal_s16(vsqr, vget_low_s16(in), vget_low_s16(in));
- vsqr = vmlal_high_s16(vsqr, in, in);
+ vsqr = vmlal_u16(vsqr, vget_low_u16(in), vget_low_u16(in));
+ vsqr = vmlal_high_u16(vsqr, in, in);
}
for (; x < size; x++)
{
sum += pix[x];
sqr += pix[x] * pix[x];
}
- sum += vaddvq_s16(vsum);
+
+ sum += vaddvq_u16(vsum);
pix += i_stride;
}
@@ -1474,7 +1489,7 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i
int x = 0;
for (; (x + 8) < blockSize; x += 8)
{
- int16x8_t vfenc, vpred;
+ uint16x8_t vfenc, vpred;
#if HIGH_BIT_DEPTH
vfenc = vld1q_u16(fenc + x);
vpred = vld1q_u16(pred + x);
@@ -1482,7 +1497,8 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i
vfenc = vmovl_u8(vld1_u8(fenc + x));
vpred = vmovl_u8(vld1_u8(pred + x));
#endif
- vst1q_s16(residual + x, vsubq_s16(vfenc, vpred));
+ int16x8_t res = vreinterpretq_s16_u16(vsubq_u16(vfenc, vpred));
+ vst1q_s16(residual + x, res);
}
for (; x < blockSize; x++)
{
--
2.42.1