[x265] [PATCH] asm: improve old avx2 code for sad[64x48]
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Mon Apr 6 12:31:27 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1428315710 -19800
# Mon Apr 06 15:51:50 2015 +0530
# Node ID 6fd0c740b7c417b7b54627ead1edd288f33718f4
# Parent ee87bb231905d5d8462e763d7e25c7957f52af2a
asm: improve old avx2 code for sad[64x48]
old:
sad[64x48] 16.79x 1504.65 25267.23
new:
sad[64x48] 20.18x 1260.99 25451.33
diff -r ee87bb231905 -r 6fd0c740b7c4 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Apr 06 14:53:36 2015 +0530
+++ b/source/common/x86/sad-a.asm Mon Apr 06 15:51:50 2015 +0530
@@ -4306,10 +4306,12 @@
RET
INIT_YMM avx2
-cglobal pixel_sad_64x48, 4,5,6
+cglobal pixel_sad_64x48, 4,7,6
xorps m0, m0
xorps m5, m5
- mov r4d, 24
+ mov r4d, 12
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
.loop
movu m1, [r0] ; first 32 of row 0 of pix0
movu m2, [r2] ; first 32 of row 0 of pix1
@@ -4331,8 +4333,28 @@
paddd m0, m1
paddd m5, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
+ movu m1, [r0 + 2 * r1] ; first 32 of row 2 of pix0
+ movu m2, [r2 + 2 * r3] ; first 32 of row 2 of pix1
+ movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0
+ movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1
+
+ psadbw m1, m2
+ psadbw m3, m4
+ paddd m0, m1
+ paddd m5, m3
+
+ movu m1, [r0 + r5] ; first 32 of row 3 of pix0 (r5 = 3 * r1)
+ movu m2, [r2 + r6] ; first 32 of row 3 of pix1 (r6 = 3 * r3)
+ movu m3, [r0 + 32 + r5] ; second 32 of row 3 of pix0
+ movu m4, [r2 + 32 + r6] ; second 32 of row 3 of pix1
+
+ psadbw m1, m2
+ psadbw m3, m4
+ paddd m0, m1
+ paddd m5, m3
+
+ lea r2, [r2 + 4 * r3]
+ lea r0, [r0 + 4 * r1]
dec r4d
jnz .loop
More information about the x265-devel
mailing list