[x265] [PATCH] asm: improve sad[32x32] 10% by unrolling loop

Min Chen chenm003 at 163.com
Mon Mar 16 22:00:48 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426539636 25200
# Node ID 117fb09221983c5f50988741168a216d35e3581a
# Parent  d33fc159951225e42889071ef3d877d23f693197
asm: improve sad[32x32] 10% by unrolling loop
---
 source/common/x86/sad-a.asm |   25 ++++++++++++++++++-------
 1 files changed, 18 insertions(+), 7 deletions(-)

diff -r d33fc1599512 -r 117fb0922198 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Mar 16 12:00:42 2015 -0700
+++ b/source/common/x86/sad-a.asm	Mon Mar 16 14:00:36 2015 -0700
@@ -3898,9 +3898,11 @@
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x32, 4,5,5
+cglobal pixel_sad_32x32, 4,7,5
     xorps           m0, m0
-    mov             r4d, 16
+    mov             r4d, 32/4
+    lea             r5, [r1 * 3]
+    lea             r6, [r3 * 3]
 
 .loop
     movu           m1, [r0]               ; row 0 of pix0
@@ -3913,11 +3915,21 @@
     paddd          m0, m1
     paddd          m0, m3
 
-    lea     r2,     [r2 + 2 * r3]
-    lea     r0,     [r0 + 2 * r1]
-
-    dec         r4d
-    jnz         .loop
+    movu           m1, [r0 + 2 * r1]      ; row 2 of pix0
+    movu           m2, [r2 + 2 * r3]      ; row 2 of pix1
+    movu           m3, [r0 + r5]          ; row 3 of pix0
+    movu           m4, [r2 + r6]          ; row 3 of pix1
+
+    psadbw         m1, m2
+    psadbw         m3, m4
+    paddd          m0, m1
+    paddd          m0, m3
+
+    lea            r2,     [r2 + 4 * r3]
+    lea            r0,     [r0 + 4 * r1]
+
+    dec            r4d
+    jnz           .loop
 
     vextracti128   xm1, m0, 1
     paddd          xm0, xm1
@@ -3926,5 +3938,4 @@
     movd            eax, xm0
     RET
 
-
 %endif



More information about the x265-devel mailing list