[x265] [PATCH] asm: improve the old avx2 code for sad[64x64]
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Mon Apr 6 12:31:12 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1428312216 -19800
# Mon Apr 06 14:53:36 2015 +0530
# Node ID ee87bb231905d5d8462e763d7e25c7957f52af2a
# Parent 7fcb5dd81aa6b7dd20ce072e32b18e1ffa6bd5c1
asm: improve the old avx2 code for sad[64x64]
old (columns: speedup over C reference, optimized cycles, C reference cycles):
sad[64x64] 21.47x 1702.40 36545.14
new:
sad[64x64] 22.89x 1595.16 36506.87
diff -r 7fcb5dd81aa6 -r ee87bb231905 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Apr 06 11:47:55 2015 +0530
+++ b/source/common/x86/sad-a.asm Mon Apr 06 14:53:36 2015 +0530
@@ -4346,10 +4346,12 @@
RET
INIT_YMM avx2
-cglobal pixel_sad_64x64, 4,5,6
+cglobal pixel_sad_64x64, 4,7,6
xorps m0, m0
xorps m5, m5
mov r4d, 8
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
.loop
movu m1, [r0] ; first 32 of row 0 of pix0
movu m2, [r2] ; first 32 of row 0 of pix1
@@ -4371,31 +4373,28 @@
paddd m0, m1
paddd m5, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
-
- movu m1, [r0] ; first 32 of row 2 of pix0
- movu m2, [r2] ; first 32 of row 2 of pix1
- movu m3, [r0 + 32] ; second 32 of row 2 of pix0
- movu m4, [r2 + 32] ; second 32 of row 2 of pix1
+ movu m1, [r0 + 2 * r1] ; first 32 of row 2 of pix0
+ movu m2, [r2 + 2 * r3] ; first 32 of row 2 of pix1
+ movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0
+ movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1
psadbw m1, m2
psadbw m3, m4
paddd m0, m1
paddd m5, m3
- movu m1, [r0 + r1] ; first 32 of row 3 of pix0
- movu m2, [r2 + r3] ; first 32 of row 3 of pix1
- movu m3, [r0 + 32 + r1] ; second 32 of row 3 of pix0
- movu m4, [r2 + 32 + r3] ; second 32 of row 3 of pix1
+ movu m1, [r0 + r5] ; first 32 of row 3 of pix0
+ movu m2, [r2 + r6] ; first 32 of row 3 of pix1
+ movu m3, [r0 + 32 + r5] ; second 32 of row 3 of pix0
+ movu m4, [r2 + 32 + r6] ; second 32 of row 3 of pix1
psadbw m1, m2
psadbw m3, m4
paddd m0, m1
paddd m5, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r0, [r0 + 4 * r1]
movu m1, [r0] ; first 32 of row 4 of pix0
movu m2, [r2] ; first 32 of row 4 of pix1
@@ -4417,31 +4416,28 @@
paddd m0, m1
paddd m5, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
-
- movu m1, [r0] ; first 32 of row 6 of pix0
- movu m2, [r2] ; first 32 of row 6 of pix1
- movu m3, [r0 + 32] ; second 32 of row 6 of pix0
- movu m4, [r2 + 32] ; second 32 of row 6 of pix1
+ movu m1, [r0 + 2 * r1] ; first 32 of row 6 of pix0
+ movu m2, [r2 + 2 * r3] ; first 32 of row 6 of pix1
+ movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 6 of pix0
+ movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 6 of pix1
psadbw m1, m2
psadbw m3, m4
paddd m0, m1
paddd m5, m3
- movu m1, [r0 + r1] ; first 32 of row 7 of pix0
- movu m2, [r2 + r3] ; first 32 of row 7 of pix1
- movu m3, [r0 + 32 + r1] ; second 32 of row 7 of pix0
- movu m4, [r2 + 32 + r3] ; second 32 of row 7 of pix1
+ movu m1, [r0 + r5] ; first 32 of row 7 of pix0
+ movu m2, [r2 + r6] ; first 32 of row 7 of pix1
+ movu m3, [r0 + 32 + r5] ; second 32 of row 7 of pix0
+ movu m4, [r2 + 32 + r6] ; second 32 of row 7 of pix1
psadbw m1, m2
psadbw m3, m4
paddd m0, m1
paddd m5, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r0, [r0 + 4 * r1]
dec r4d
jnz .loop
More information about the x265-devel mailing list