[x265] [PATCH] AArch64: Reuse code for sse_pp_neon and sse_pp_neon_dotprod
Hari Limaye
hari.limaye at arm.com
Thu Aug 22 10:04:36 UTC 2024
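Refactor the implementations of sse_pp_neon and sse_pp_neon_dotprod for
block sizes of width 32 so that each per-height entry point only loads
its loop count (h / 4) into w4 and branches to a single shared,
non-exported function containing the loop, to reduce code size.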
---
source/common/aarch64/ssd-a.S | 16 ++++++++++------
source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------
2 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
index 4a5e80d49..a66d68617 100644
--- a/source/common/aarch64/ssd-a.S
+++ b/source/common/aarch64/ssd-a.S
@@ -101,13 +101,11 @@ SSE_PP_16xN 16
SSE_PP_16xN 32
// Loop unrolled to process 4 rows per iteration.
-.macro SSE_PP_32xN h
-function PFX(pixel_sse_pp_32x\h\()_neon)
- mov w12, #(\h / 4)
+function PFX(pixel_sse_pp_32xh_neon), export=0
movi v0.4s, #0
movi v1.4s, #0
-.Loop_sse_pp_32_x\h:
- sub w12, w12, #1
+.Loop_sse_pp_32xh:
+ sub w4, w4, #1
.rept 4
ld1 {v16.16b,v17.16b}, [x0], x1
ld1 {v18.16b,v19.16b}, [x2], x3
@@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon)
uadalp v0.4s, v22.8h
uadalp v1.4s, v23.8h
.endr
- cbnz w12, .Loop_sse_pp_32_x\h
+ cbnz w4, .Loop_sse_pp_32xh
add v0.4s, v0.4s, v1.4s
ret_v0_w0
endfunc
+
+.macro SSE_PP_32xN h
+function PFX(pixel_sse_pp_32x\h\()_neon)
+ mov w4, \h / 4
+ b PFX(pixel_sse_pp_32xh_neon)
+endfunc
.endm
SSE_PP_32xN 32
diff --git a/source/common/aarch64/ssd-neon-dotprod.S b/source/common/aarch64/ssd-neon-dotprod.S
index 4df4fb35b..044412fba 100644
--- a/source/common/aarch64/ssd-neon-dotprod.S
+++ b/source/common/aarch64/ssd-neon-dotprod.S
@@ -110,13 +110,11 @@ SSE_PP_16xN 16
SSE_PP_16xN 32
// Loop unrolled to process 4 rows per iteration.
-.macro SSE_PP_32xN h
-function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
- mov w12, #(\h / 4)
+function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
movi v0.4s, #0
movi v1.4s, #0
-.Loop_sse_pp_32_x\h:
- sub w12, w12, #1
+.Loop_sse_pp_32xh:
+ sub w4, w4, #1
.rept 4
ld1 {v16.16b,v17.16b}, [x0], x1
ld1 {v18.16b,v19.16b}, [x2], x3
@@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
uabd v3.16b, v17.16b, v19.16b
udot v1.4s, v3.16b, v3.16b
.endr
- cbnz w12, .Loop_sse_pp_32_x\h
+ cbnz w4, .Loop_sse_pp_32xh
add v0.4s, v0.4s, v1.4s
addv s0, v0.4s
fmov w0, s0
ret
endfunc
+
+.macro SSE_PP_32xN h
+function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
+ mov w4, \h / 4
+ b PFX(pixel_sse_pp_32xh_neon_dotprod)
+endfunc
.endm
SSE_PP_32xN 32
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Reuse-code-for-sse_pp_neon-and-sse_pp_neon_d.patch
Type: text/x-patch
Size: 3105 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240822/952f5a50/attachment.bin>
More information about the x265-devel
mailing list