[x265] [PATCH 2/5] AArch64: Add Neon implementation of pelFilterLumaStrong_H
chen
chenm003 at 163.com
Wed Feb 12 06:55:07 UTC 2025
Suggest replace assert() by X265_CHECK, other looks good.
At 2025-02-11 23:19:58, "Micro Daryl Robles" <microdaryl.robles at arm.com> wrote:
>Relative performance compared to scalar C:
>
> Neoverse N1: 3.67x
> Neoverse N2: 2.57x
> Neoverse V1: 2.31x
> Neoverse V2: 2.35x
>---
> source/common/aarch64/loopfilter-prim.cpp | 75 +++++++++++++++++++++++
> 1 file changed, 75 insertions(+)
>
>diff --git a/source/common/aarch64/loopfilter-prim.cpp b/source/common/aarch64/loopfilter-prim.cpp
>index 9242c859e..7fef25cbb 100644
>--- a/source/common/aarch64/loopfilter-prim.cpp
>+++ b/source/common/aarch64/loopfilter-prim.cpp
>@@ -1,5 +1,6 @@
> #include "common.h"
> #include "loopfilter-prim.h"
>+#include "mem-neon.h"
>
> #define PIXEL_MIN 0
>
>@@ -311,6 +312,79 @@ void pelFilterLumaStrong_V_neon(pixel *src, intptr_t srcStep, intptr_t offset,
> }
> }
>
>+void pelFilterLumaStrong_H_neon(pixel *src, intptr_t srcStep, intptr_t offset,
>+ int32_t tcP, int32_t tcQ)
>+{
>+ assert(UNIT_SIZE == 4 && srcStep == 1);
>+ (void)srcStep;
>+
>+ const int16x8_t tc_vec = vcombine_s16(vdup_n_s16(tcP), vdup_n_s16(tcQ));
>+ const int16x8_t neg_tc_vec = vnegq_s16(tc_vec);
>+
>+ uint8x8_t m0 = vld1_u8(src - 4 * offset);
>+ uint8x8_t m1 = vld1_u8(src - 3 * offset);
>+ uint8x8_t m2 = vld1_u8(src - 2 * offset);
>+ uint8x8_t m3 = vld1_u8(src - 1 * offset);
>+ uint8x8_t m4 = vld1_u8(src - 0 * offset);
>+ uint8x8_t m5 = vld1_u8(src + 1 * offset);
>+ uint8x8_t m6 = vld1_u8(src + 2 * offset);
>+ uint8x8_t m7 = vld1_u8(src + 3 * offset);
>+
>+ uint8x8_t m12 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), vreinterpret_u32_u8(m2)));
>+ uint8x8_t m23 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), vreinterpret_u32_u8(m3)));
>+ uint8x8_t m34 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m3), vreinterpret_u32_u8(m4)));
>+ uint8x8_t m45 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m4), vreinterpret_u32_u8(m5)));
>+ uint8x8_t m56 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m5), vreinterpret_u32_u8(m6)));
>+
>+ // src[-1 * offset], src[0 * offset]
>+ uint16x8_t p0 = vaddl_u8(m23, m34);
>+ p0 = vaddw_u8(p0, m45);
>+ uint16x8_t t0 = vshlq_n_u16(p0, 1);
>+ uint16x8_t t1 = vaddl_u8(m12, m56);
>+ uint16x8_t t01 = vaddq_u16(t0, t1);
>+ t01 = vrshrq_n_u16(t01, 3);
>+ t01 = vsubw_u8(t01, m34);
>+ t01 = vreinterpretq_u16_s16(
>+ vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t01))));
>+ uint8x8_t d01 = vmovn_u16(t01);
>+ d01 = vadd_u8(d01, m34);
>+ store_u8x4_strided_xN<2>(&src[-1 * offset], 1 * offset, &d01);
>+
>+ uint8x8_t m16 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), vreinterpret_u32_u8(m6)));
>+ uint8x8_t m25 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), vreinterpret_u32_u8(m5)));
>+
>+ // src[-2 * offset], src[1 * offset]
>+ uint16x8_t p1 = vaddw_u8(p0, m16);
>+ uint16x8_t t23 = vrshrq_n_u16(p1, 2);
>+ t23 = vsubw_u8(t23, m25);
>+ t23 = vreinterpretq_u16_s16(
>+ vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t23))));
>+ uint8x8_t d23 = vmovn_u16(t23);
>+ d23 = vadd_u8(d23, m25);
>+ store_u8x4_strided_xN<2>(&src[-2 * offset], 3 * offset, &d23);
>+
>+ uint8x8_t m07 =
>+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m0), vreinterpret_u32_u8(m7)));
>+
>+ // src[-3 * offset], src[2 * offset]
>+ uint16x8_t p2 = vaddl_u8(m07, m16);
>+ uint16x8_t t45 = vmlaq_n_u16(p1, p2, 2);
>+ t45 = vrshrq_n_u16(t45, 3);
>+ t45 = vsubw_u8(t45, m16);
>+ t45 = vreinterpretq_u16_s16(
>+ vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t45))));
>+ uint8x8_t d45 = vmovn_u16(t45);
>+ d45 = vadd_u8(d45, m16);
>+ store_u8x4_strided_xN<2>(&src[-3 * offset], 5 * offset, &d45);
>+}
>+
> } // namespace
>
> namespace X265_NS
>@@ -328,6 +402,7 @@ void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
> p.sign = calSign_neon;
>
> p.pelFilterLumaStrong[0] = pelFilterLumaStrong_V_neon;
>+ p.pelFilterLumaStrong[1] = pelFilterLumaStrong_H_neon;
> }
>
>
>--
>2.34.1
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250212/483cd71e/attachment-0001.htm>
More information about the x265-devel
mailing list