[x265] [PATCH] dct: unroll for 30% speedup on Apple Silicon

Sun Nov 24 04:34:48 UTC 2024

From: Ganesh Ajjanagadde <gajjanag at alum.mit.edu>

Apple silicon has four 128-bit NEON execution units and benefits from loop unrolling.

From ./TestBench on an M4 Mac Mini:

before:
dct8x8		 | 	2.32x | 	 205.12   | 	 476.62
dct16x16	 | 	2.02x | 	 801.20   | 	 1619.62
dct32x32	 | 	3.47x | 	 7566.39  | 	 26275.65
idct4x4		 | 	0.90x | 	 175.80   | 	 157.90
idct16x16	 | 	2.05x | 	 863.30   | 	 1771.80
idct32x32	 | 	1.79x | 	 6344.33  | 	 11351.99

after:
dct8x8		 | 	2.33x | 	 204.72   | 	 476.53
dct16x16	 | 	2.04x | 	 802.16   | 	 1637.39
dct32x32	 | 	4.96x | 	 5181.02  | 	 25700.34
idct4x4		 | 	1.08x | 	 162.09   | 	 174.40
idct16x16	 | 	1.95x | 	 910.01   | 	 1771.61
idct32x32	 | 	1.75x | 	 6350.72  | 	 11143.71

This yields a ~2% end-to-end encoding speedup.
---
 source/common/aarch64/dct-prim.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index 8b523ceb0..e6ee7005b 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -435,6 +435,7 @@ static inline void partialButterfly32_neon(const int16_t *src, int16_t *dst)
         for (int i = 0; i < line; i += 4)
         {
             int32x4_t t[4];
+X265_PRAGMA_UNROLL(4)
             for (int j = 0; j < 4; ++j) {
                 t[j] = vmull_s16(c0, vget_low_s16(O[i + j][0]));
                 t[j] = vmlal_s16(t[j], c1, vget_high_s16(O[i + j][0]));
@@ -461,6 +462,7 @@ static inline void partialButterfly32_neon(const int16_t *src, int16_t *dst)
         for (int i = 0; i < line; i += 4)
         {
             int32x4_t t[4];
+X265_PRAGMA_UNROLL(4)
             for (int j = 0; j < 4; ++j) {
                 t[j] = vmulq_s32(c0, EO[i + j][0]);
                 t[j] = vmlaq_s32(t[j], c1, EO[i + j][1]);
-- 
2.39.5 (Apple Git-154)