[x265] [PATCH] AArch64: Reuse code for sse_pp_neon and sse_pp_neon_dotprod
Karam Singh
karam.singh at multicorewareinc.com
Fri Aug 23 04:24:17 UTC 2024
Pushed to master branch.
*__________________________*
*Karam Singh*
*Ph.D. IIT Guwahati*
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089
On Thu, Aug 22, 2024 at 3:34 PM Hari Limaye <hari.limaye at arm.com> wrote:
> Refactor the implementations of sse_pp_neon and sse_pp_neon_dotprod for
> block sizes of width 32 to dispatch to shared functions, to reduce code
> size.
> ---
> source/common/aarch64/ssd-a.S | 16 ++++++++++------
> source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------
> 2 files changed, 20 insertions(+), 12 deletions(-)
>
> diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
> index 4a5e80d49..a66d68617 100644
> --- a/source/common/aarch64/ssd-a.S
> +++ b/source/common/aarch64/ssd-a.S
> @@ -101,13 +101,11 @@ SSE_PP_16xN 16
> SSE_PP_16xN 32
>
> // Loop unrolled to process 4 rows per iteration.
> -.macro SSE_PP_32xN h
> -function PFX(pixel_sse_pp_32x\h\()_neon)
> - mov w12, #(\h / 4)
> +function PFX(pixel_sse_pp_32xh_neon), export=0
> movi v0.4s, #0
> movi v1.4s, #0
> -.Loop_sse_pp_32_x\h:
> - sub w12, w12, #1
> +.Loop_sse_pp_32xh:
> + sub w4, w4, #1
> .rept 4
> ld1 {v16.16b,v17.16b}, [x0], x1
> ld1 {v18.16b,v19.16b}, [x2], x3
> @@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon)
> uadalp v0.4s, v22.8h
> uadalp v1.4s, v23.8h
> .endr
> - cbnz w12, .Loop_sse_pp_32_x\h
> + cbnz w4, .Loop_sse_pp_32xh
> add v0.4s, v0.4s, v1.4s
> ret_v0_w0
> endfunc
> +
> +.macro SSE_PP_32xN h
> +function PFX(pixel_sse_pp_32x\h\()_neon)
> + mov w4, \h / 4
> + b PFX(pixel_sse_pp_32xh_neon)
> +endfunc
> .endm
>
> SSE_PP_32xN 32
> diff --git a/source/common/aarch64/ssd-neon-dotprod.S b/source/common/aarch64/ssd-neon-dotprod.S
> index 4df4fb35b..044412fba 100644
> --- a/source/common/aarch64/ssd-neon-dotprod.S
> +++ b/source/common/aarch64/ssd-neon-dotprod.S
> @@ -110,13 +110,11 @@ SSE_PP_16xN 16
> SSE_PP_16xN 32
>
> // Loop unrolled to process 4 rows per iteration.
> -.macro SSE_PP_32xN h
> -function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
> - mov w12, #(\h / 4)
> +function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
> movi v0.4s, #0
> movi v1.4s, #0
> -.Loop_sse_pp_32_x\h:
> - sub w12, w12, #1
> +.Loop_sse_pp_32xh:
> + sub w4, w4, #1
> .rept 4
> ld1 {v16.16b,v17.16b}, [x0], x1
> ld1 {v18.16b,v19.16b}, [x2], x3
> @@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
> uabd v3.16b, v17.16b, v19.16b
> udot v1.4s, v3.16b, v3.16b
> .endr
> - cbnz w12, .Loop_sse_pp_32_x\h
> + cbnz w4, .Loop_sse_pp_32xh
> add v0.4s, v0.4s, v1.4s
> addv s0, v0.4s
> fmov w0, s0
> ret
> endfunc
> +
> +.macro SSE_PP_32xN h
> +function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
> + mov w4, \h / 4
> + b PFX(pixel_sse_pp_32xh_neon_dotprod)
> +endfunc
> .endm
>
> SSE_PP_32xN 32
> --
> 2.42.1
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240823/9794a491/attachment-0001.htm>
More information about the x265-devel mailing list