[x264-devel] [Git][videolan/x264][master] aarch64: Improve scheduling in sad_x3/sad_x4

Martin Storsjö (@mstorsjo) gitlab at videolan.org
Thu Nov 2 21:24:58 UTC 2023



Martin Storsjö pushed to branch master at VideoLAN / x264


Commits:
4664f5aa by Martin Storsjö at 2023-11-02T13:27:08+02:00
aarch64: Improve scheduling in sad_x3/sad_x4

               Cortex A53    A72    A73
8 bpc:
Before:
sad_x3_4x4_neon:      580    303    204
sad_x3_4x8_neon:     1065    516    323
sad_x3_8x4_neon:      668    262    282
sad_x3_8x8_neon:     1238    454    471
sad_x3_8x16_neon:    2378    842    847
sad_x3_16x8_neon:    2136    738    776
sad_x3_16x16_neon:   4162   1378   1463
After:
sad_x3_4x4_neon:      477    298    206
sad_x3_4x8_neon:      842    515    327
sad_x3_8x4_neon:      603    260    279
sad_x3_8x8_neon:     1110    451    464
sad_x3_8x16_neon:    2125    841    843
sad_x3_16x8_neon:    2124    730    766
sad_x3_16x16_neon:   4145   1370   1434

10 bpc:
Before:
sad_x3_4x4_neon:      632    247    254
sad_x3_4x8_neon:     1162    419    443
sad_x3_8x4_neon:      890    358    416
sad_x3_8x8_neon:     1670    632    759
sad_x3_8x16_neon:    3230   1179   1458
sad_x3_16x8_neon:    3070   1209   1403
sad_x3_16x16_neon:   6030   2333   2699

After:
sad_x3_4x4_neon:      522    253    255
sad_x3_4x8_neon:      932    443    431
sad_x3_8x4_neon:      880    354    406
sad_x3_8x8_neon:     1660    626    736
sad_x3_8x16_neon:    3220   1170   1397
sad_x3_16x8_neon:    3060   1184   1362
sad_x3_16x16_neon:   6020   2272   2579

Thus, this is around a 20-25% speedup on the in-order Cortex A53 for the
smallest sizes (4x4 and 4x8), with a much smaller difference for the
bigger sizes, while it makes hardly any difference (mostly within
measurement noise) on the out-of-order cores (A72 and A73).

- - - - -


1 changed file:

- common/aarch64/pixel-a.S


Changes:

=====================================
common/aarch64/pixel-a.S
=====================================
@@ -134,16 +134,18 @@ endfunc
     ld1        {v1.s}[0], [x1], x5
     ld1        {v0.s}[1], [x0], x7
     ld1        {v1.s}[1], [x1], x5
-    \first      v16.8h,  v1.8b,  v0.8b
     ld1        {v2.s}[0], [x2], x5
     ld1        {v2.s}[1], [x2], x5
-    \first      v17.8h,  v2.8b,  v0.8b
+    \first      v16.8h,  v1.8b,  v0.8b
     ld1        {v3.s}[0], [x3], x5
     ld1        {v3.s}[1], [x3], x5
-    \first      v18.8h,  v3.8b,  v0.8b
+    \first      v17.8h,  v2.8b,  v0.8b
 .if \x == 4
     ld1        {v4.s}[0], [x4], x5
     ld1        {v4.s}[1], [x4], x5
+.endif
+    \first      v18.8h,  v3.8b,  v0.8b
+.if \x == 4
     \first      v19.8h,  v4.8b,  v0.8b
 .endif
 .endm
@@ -151,54 +153,58 @@ endfunc
 .macro SAD_X_8 x, first=uabal
     ld1        {v0.8b}, [x0], x7
     ld1        {v1.8b}, [x1], x5
-    \first      v16.8h,  v1.8b,  v0.8b
     ld1        {v2.8b}, [x2], x5
-    ld1        {v5.8b}, [x0], x7
-    \first      v17.8h,  v2.8b,  v0.8b
+    \first      v16.8h,  v1.8b,  v0.8b
     ld1        {v3.8b}, [x3], x5
+    \first      v17.8h,  v2.8b,  v0.8b
+    ld1        {v5.8b}, [x0], x7
     ld1        {v1.8b}, [x1], x5
     \first      v18.8h,  v3.8b,  v0.8b
-    uabal       v16.8h,  v1.8b,  v5.8b
     ld1        {v2.8b}, [x2], x5
+    uabal       v16.8h,  v1.8b,  v5.8b
     ld1        {v3.8b}, [x3], x5
     uabal       v17.8h,  v2.8b,  v5.8b
-    uabal       v18.8h,  v3.8b,  v5.8b
 .if \x == 4
     ld1        {v4.8b}, [x4], x5
+    ld1        {v1.8b}, [x4], x5
+.endif
+    uabal       v18.8h,  v3.8b,  v5.8b
+.if \x == 4
     \first      v19.8h,  v4.8b,  v0.8b
-    ld1        {v4.8b}, [x4], x5
-    uabal       v19.8h,  v4.8b,  v5.8b
+    uabal       v19.8h,  v1.8b,  v5.8b
 .endif
 .endm
 
 .macro SAD_X_16 x, first=uabal
     ld1        {v0.16b}, [x0], x7
     ld1        {v1.16b}, [x1], x5
+    ld1        {v2.16b}, [x2], x5
     \first      v16.8h,  v1.8b,  v0.8b
     \first\()2  v20.8h,  v1.16b, v0.16b
-    ld1        {v2.16b}, [x2], x5
-    ld1        {v5.16b}, [x0], x7
+    ld1        {v3.16b}, [x3], x5
     \first      v17.8h,  v2.8b,  v0.8b
     \first\()2  v21.8h,  v2.16b, v0.16b
-    ld1        {v3.16b}, [x3], x5
+    ld1        {v5.16b}, [x0], x7
     ld1        {v1.16b}, [x1], x5
     \first      v18.8h,  v3.8b,  v0.8b
     \first\()2  v22.8h,  v3.16b, v0.16b
+    ld1        {v2.16b}, [x2], x5
     uabal       v16.8h,  v1.8b,  v5.8b
     uabal2      v20.8h,  v1.16b, v5.16b
-    ld1        {v2.16b}, [x2], x5
     ld1        {v3.16b}, [x3], x5
     uabal       v17.8h,  v2.8b,  v5.8b
     uabal2      v21.8h,  v2.16b, v5.16b
+.if \x == 4
+    ld1        {v4.16b}, [x4], x5
+    ld1        {v1.16b}, [x4], x5
+.endif
     uabal       v18.8h,  v3.8b,  v5.8b
     uabal2      v22.8h,  v3.16b, v5.16b
 .if \x == 4
-    ld1        {v4.16b}, [x4], x5
     \first      v19.8h,  v4.8b,  v0.8b
     \first\()2  v23.8h,  v4.16b, v0.16b
-    ld1        {v4.16b}, [x4], x5
-    uabal       v19.8h,  v4.8b,  v5.8b
-    uabal2      v23.8h,  v4.16b, v5.16b
+    uabal       v19.8h,  v1.8b,  v5.8b
+    uabal2      v23.8h,  v1.16b, v5.16b
 .endif
 .endm
 
@@ -1468,16 +1474,18 @@ endfunc
     ld1         {v1.d}[0], [x1], x5
     ld1         {v0.d}[1], [x0], x7
     ld1         {v1.d}[1], [x1], x5
-    \first      v16.8h, v1.8h, v0.8h
     ld1         {v2.d}[0], [x2], x5
     ld1         {v2.d}[1], [x2], x5
-    \first      v17.8h, v2.8h, v0.8h
+    \first      v16.8h, v1.8h, v0.8h
     ld1         {v3.d}[0], [x3], x5
     ld1         {v3.d}[1], [x3], x5
-    \first      v18.8h, v3.8h, v0.8h
+    \first      v17.8h, v2.8h, v0.8h
 .if \x == 4
     ld1         {v4.d}[0], [x4], x5
     ld1         {v4.d}[1], [x4], x5
+.endif
+    \first      v18.8h, v3.8h, v0.8h
+.if \x == 4
     \first      v19.8h, v4.8h, v0.8h
 .endif
 .endm
@@ -1487,57 +1495,61 @@ endfunc
     ld1         {v1.8h}, [x1], x5
     \first      v16.8h, v1.8h, v0.8h
     ld1         {v2.8h}, [x2], x5
-    ld1         {v5.8h}, [x0], x7
-    \first      v17.8h, v2.8h, v0.8h
     ld1         {v3.8h}, [x3], x5
+    \first      v17.8h, v2.8h, v0.8h
+    ld1         {v5.8h}, [x0], x7
     ld1         {v1.8h}, [x1], x5
     \first      v18.8h, v3.8h, v0.8h
-    uaba        v16.8h, v1.8h, v5.8h
     ld1         {v2.8h}, [x2], x5
+    uaba        v16.8h, v1.8h, v5.8h
     ld1         {v3.8h}, [x3], x5
     uaba        v17.8h, v2.8h, v5.8h
-    uaba        v18.8h, v3.8h, v5.8h
 .if \x == 4
     ld1         {v4.8h}, [x4], x5
+    ld1         {v1.8h}, [x4], x5
+.endif
+    uaba        v18.8h, v3.8h, v5.8h
+.if \x == 4
     \first      v19.8h, v4.8h, v0.8h
-    ld1         {v4.8h}, [x4], x5
-    uaba        v19.8h, v4.8h, v5.8h
+    uaba        v19.8h, v1.8h, v5.8h
 .endif
 .endm
 
 .macro SAD_X_16 x, first=uaba
     ld1         {v0.8h, v1.8h}, [x0], x7
     ld1         {v2.8h, v3.8h}, [x1], x5
-    \first      v16.8h, v2.8h, v0.8h
-    \first      v20.8h, v3.8h, v1.8h
 
     ld1         {v4.8h, v5.8h}, [x2], x5
-    ld1         {v6.8h, v7.8h}, [x0], x7
+    \first      v16.8h, v2.8h, v0.8h
+    \first      v20.8h, v3.8h, v1.8h
+    ld1         {v24.8h, v25.8h}, [x3], x5
     \first      v17.8h, v4.8h, v0.8h
     \first      v21.8h, v5.8h, v1.8h
 
-    ld1         {v24.8h, v25.8h}, [x3], x5
+    ld1         {v6.8h, v7.8h}, [x0], x7
     ld1         {v2.8h, v3.8h}, [x1], x5
     \first      v18.8h, v24.8h, v0.8h
     \first      v22.8h, v25.8h, v1.8h
+    ld1         {v4.8h, v5.8h}, [x2], x5
     uaba        v16.8h, v2.8h, v6.8h
     uaba        v20.8h, v3.8h, v7.8h
 
-    ld1         {v4.8h, v5.8h}, [x2], x5
     ld1         {v24.8h, v25.8h}, [x3], x5
     uaba        v17.8h, v4.8h, v6.8h
     uaba        v21.8h, v5.8h, v7.8h
 
+.if \x == 4
+    ld1         {v26.8h, v27.8h}, [x4], x5
+    ld1         {v28.8h, v29.8h}, [x4], x5
+.endif
     uaba        v18.8h, v24.8h, v6.8h
     uaba        v22.8h, v25.8h, v7.8h
 .if \x == 4
-    ld1         {v26.8h, v27.8h}, [x4], x5
     \first      v19.8h, v26.8h, v0.8h
     \first      v23.8h, v27.8h, v1.8h
 
-    ld1         {v26.8h, v27.8h}, [x4], x5
-    uaba        v19.8h, v26.8h, v6.8h
-    uaba        v23.8h, v27.8h, v7.8h
+    uaba        v19.8h, v28.8h, v6.8h
+    uaba        v23.8h, v29.8h, v7.8h
 .endif
 .endm
 



View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/4664f5aa66166ee3e11d99bfd6cbc7064abf76cc

-- 
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/4664f5aa66166ee3e11d99bfd6cbc7064abf76cc
You're receiving this email because of your account on code.videolan.org.


VideoLAN code repository instance


More information about the x264-devel mailing list