[x265] [PATCH] asm: improve sad[32x32] by 10% by unrolling the loop
Min Chen
chenm003 at 163.com
Mon Mar 16 22:00:48 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426539636 25200
# Node ID 117fb09221983c5f50988741168a216d35e3581a
# Parent d33fc159951225e42889071ef3d877d23f693197
asm: improve sad[32x32] by 10% by unrolling the loop
---
source/common/x86/sad-a.asm | 25 ++++++++++++++++++-------
1 files changed, 18 insertions(+), 7 deletions(-)
diff -r d33fc1599512 -r 117fb0922198 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Mar 16 12:00:42 2015 -0700
+++ b/source/common/x86/sad-a.asm Mon Mar 16 14:00:36 2015 -0700
@@ -3898,9 +3898,11 @@
RET
INIT_YMM avx2
-cglobal pixel_sad_32x32, 4,5,5
+cglobal pixel_sad_32x32, 4,7,5
xorps m0, m0
- mov r4d, 16
+ mov r4d, 32/4
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
.loop
movu m1, [r0] ; row 0 of pix0
@@ -3913,11 +3915,21 @@
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
-
- dec r4d
- jnz .loop
+ movu m1, [r0 + 2 * r1] ; row 2 of pix0
+ movu m2, [r2 + 2 * r3] ; row 2 of pix1
+ movu m3, [r0 + r5] ; row 3 of pix0
+ movu m4, [r2 + r6] ; row 3 of pix1
+
+ psadbw m1, m2
+ psadbw m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ lea r2, [r2 + 4 * r3]
+ lea r0, [r0 + 4 * r1]
+
+ dec r4d
+ jnz .loop
vextracti128 xm1, m0, 1
paddd xm0, xm1
@@ -3926,5 +3938,4 @@
movd eax, xm0
RET
-
%endif
More information about the x265-devel mailing list