[x264-devel] [PATCH] aarch64: fix wrong argument register for mb_x in prefetch_fenc_\sub\()_aarch64

zhengzhi Duan royzzduan at foxmail.com
Mon Sep 23 06:30:32 UTC 2024


From: Zhengzhi Duan <zhengzhi.duan at shopee.com>

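mb_x is the fifth argument of prefetch_fenc, so under the AArch64 calling
convention it is passed in w4. The old code read it from w5, which is not an
argument of this function, so the prefetch addresses were computed from an
undefined value. Read mb_x from w4 and renumber the scratch registers
accordingly.

This fix gives about a 1.2 fps speedup for the veryfast preset at 720p, on a
baseline averaging 50 fps.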
---
 common/aarch64/mc-a.S | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 9c52d045..25baf0b4 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -57,21 +57,21 @@ endfunc
 //                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
 .macro prefetch_fenc sub
 function prefetch_fenc_\sub\()_aarch64, export=1
-    and         w6,  w5,  #3
-    and         w7,  w5,  #3
-    mul         x6,  x6,  x1
-    mul         x7,  x7,  x3
+    and         w5,  w4,  #3
+    and         w6,  w4,  #3
+    mul         x5,  x5,  x1
+    mul         x6,  x6,  x3
     add         x0,  x0,  #64
     add         x2,  x2,  #64
 
-    add         x0,  x0,  x6,  lsl #2
-    add         x6,  x0,  x1,  lsl #1
+    add         x0,  x0,  x5,  lsl #2
+    add         x5,  x0,  x1,  lsl #1
     prfm        pldl1strm, [x0]
     prfm        pldl1strm, [x0,  x1]
-    prfm        pldl1strm, [x6]
-    prfm        pldl1strm, [x6, x1]
+    prfm        pldl1strm, [x5]
+    prfm        pldl1strm, [x5, x1]
 
-    add         x2,  x2,  x7,  lsl #1
+    add         x2,  x2,  x6,  lsl #1
     prfm        pldl1strm, [x2]
     prfm        pldl1strm, [x2,  x3]
 .ifc \sub, 422
-- 
2.39.3 (Apple Git-145)



More information about the x264-devel mailing list