<div dir="ltr">Pushed to master branch. <br clear="all"><div><div dir="ltr" class="gmail_signature" data-smartmail="gmail_signature"><div dir="ltr"><div><b>__________________________</b></div><div><b>Karam Singh</b></div><div><b>Ph.D. IIT Guwahati</b></div><div><font size="1">Senior Software (Video Coding) Engineer  </font></div><div><font size="1">Mobile: +91 8011279030</font></div><div><font size="1">Block 9A, 6th floor, DLF Cyber City</font></div><div><font size="1">Manapakkam, Chennai 600 089</font></div></div></div></div><br></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Aug 22, 2024 at 3:34 PM Hari Limaye &lt;<a href="mailto:hari.limaye@arm.com">hari.limaye@arm.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">Refactor the implementations of sse_pp_neon and sse_pp_neon_dotprod for<br>
block sizes of width 32 to dispatch to shared functions, to reduce code<br>
size.<br>
---<br>
 source/common/aarch64/ssd-a.S            | 16 ++++++++++------<br>
 source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------<br>
 2 files changed, 20 insertions(+), 12 deletions(-)<br>
<br>
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S<br>
index 4a5e80d49..a66d68617 100644<br>
--- a/source/common/aarch64/ssd-a.S<br>
+++ b/source/common/aarch64/ssd-a.S<br>
@@ -101,13 +101,11 @@ SSE_PP_16xN 16<br>
 SSE_PP_16xN 32<br>
<br>
 // Loop unrolled to process 4 rows per iteration.<br>
-.macro SSE_PP_32xN h<br>
-function PFX(pixel_sse_pp_32x\h\()_neon)<br>
-    mov             w12, #(\h / 4)<br>
+function PFX(pixel_sse_pp_32xh_neon), export=0<br>
     movi            v0.4s, #0<br>
     movi            v1.4s, #0<br>
-.Loop_sse_pp_32_x\h:<br>
-    sub             w12, w12, #1<br>
+.Loop_sse_pp_32xh:<br>
+    sub             w4, w4, #1<br>
 .rept 4<br>
     ld1             {v16.16b,v17.16b}, [x0], x1<br>
     ld1             {v18.16b,v19.16b}, [x2], x3<br>
@@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon)<br>
     uadalp          v0.4s, v22.8h<br>
     uadalp          v1.4s, v23.8h<br>
 .endr<br>
-    cbnz            w12, .Loop_sse_pp_32_x\h<br>
+    cbnz            w4, .Loop_sse_pp_32xh<br>
     add             v0.4s, v0.4s, v1.4s<br>
     ret_v0_w0<br>
 endfunc<br>
+<br>
+.macro SSE_PP_32xN h<br>
+function PFX(pixel_sse_pp_32x\h\()_neon)<br>
+    mov             w4, \h / 4<br>
+    b               PFX(pixel_sse_pp_32xh_neon)<br>
+endfunc<br>
 .endm<br>
<br>
 SSE_PP_32xN 32<br>
diff --git a/source/common/aarch64/ssd-neon-dotprod.S b/source/common/aarch64/ssd-neon-dotprod.S<br>
index 4df4fb35b..044412fba 100644<br>
--- a/source/common/aarch64/ssd-neon-dotprod.S<br>
+++ b/source/common/aarch64/ssd-neon-dotprod.S<br>
@@ -110,13 +110,11 @@ SSE_PP_16xN 16<br>
 SSE_PP_16xN 32<br>
<br>
 // Loop unrolled to process 4 rows per iteration.<br>
-.macro SSE_PP_32xN h<br>
-function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)<br>
-    mov             w12, #(\h / 4)<br>
+function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0<br>
     movi            v0.4s, #0<br>
     movi            v1.4s, #0<br>
-.Loop_sse_pp_32_x\h:<br>
-    sub             w12, w12, #1<br>
+.Loop_sse_pp_32xh:<br>
+    sub             w4, w4, #1<br>
 .rept 4<br>
     ld1             {v16.16b,v17.16b}, [x0], x1<br>
     ld1             {v18.16b,v19.16b}, [x2], x3<br>
@@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)<br>
     uabd            v3.16b, v17.16b, v19.16b<br>
     udot            v1.4s, v3.16b, v3.16b<br>
 .endr<br>
-    cbnz            w12, .Loop_sse_pp_32_x\h<br>
+    cbnz            w4, .Loop_sse_pp_32xh<br>
     add             v0.4s, v0.4s, v1.4s<br>
     addv            s0, v0.4s<br>
     fmov            w0, s0<br>
     ret<br>
 endfunc<br>
+<br>
+.macro SSE_PP_32xN h<br>
+function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)<br>
+    mov             w4, \h / 4<br>
+    b               PFX(pixel_sse_pp_32xh_neon_dotprod)<br>
+endfunc<br>
 .endm<br>
<br>
 SSE_PP_32xN 32<br>
-- <br>
2.42.1<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div>