[x265] [PATCH] asm: new optimized algorithm for satd, ~30% faster than the previous algorithm
dnyaneshwar at multicorewareinc.com
Thu Apr 16 08:25:48 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1429164512 -19800
# Thu Apr 16 11:38:32 2015 +0530
# Node ID 507135d8bcdcb496783c49b4b0304b961a68c253
# Parent f9c0e1f233cc15ccce4eb96adef11583af082f33
asm: new optimized algorithm for satd, ~30% faster than the previous algorithm
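
Two changes account for the speedup, both visible below: every movu +
vpermq 01010000b pair becomes a single vbroadcasti128 load (hmul_8p is
swapped for hmul_16p to match the new lane layout), and the per-block
widening to 32 bits (punpcklwd/punpckhwd + paddd) is replaced by 16-bit
paddw accumulation into m6, widened only once per call with pmaddwd
against pw_1. This also frees m8/m9, so most sizes need 8 vector
registers instead of 10 (32x64 keeps m8 to split the accumulator; see
the comment in the last hunk). For orientation, here is a minimal
scalar sketch of the 4x4 Hadamard SATD these kernels accumulate; the
name and layout are illustrative, not the x265 C reference:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute values of the 2D 4x4 Hadamard transform of the
     * residual, halved; this is the cost calc_satd_16x8 accumulates
     * 16 columns x 8 rows at a time. */
    static int satd_4x4(const uint8_t *pix1, intptr_t stride1,
                        const uint8_t *pix2, intptr_t stride2)
    {
        int tmp[4][4], sum = 0;
        for (int i = 0; i < 4; i++, pix1 += stride1, pix2 += stride2)
        {
            int d0 = pix1[0] - pix2[0], d1 = pix1[1] - pix2[1];
            int d2 = pix1[2] - pix2[2], d3 = pix1[3] - pix2[3];
            int s01 = d0 + d1, t01 = d0 - d1;   /* horizontal stage */
            int s23 = d2 + d3, t23 = d2 - d3;
            tmp[i][0] = s01 + s23; tmp[i][2] = s01 - s23;
            tmp[i][1] = t01 + t23; tmp[i][3] = t01 - t23;
        }
        for (int j = 0; j < 4; j++)             /* vertical stage */
        {
            int s01 = tmp[0][j] + tmp[1][j], t01 = tmp[0][j] - tmp[1][j];
            int s23 = tmp[2][j] + tmp[3][j], t23 = tmp[2][j] - tmp[3][j];
            sum += abs(s01 + s23) + abs(s01 - s23)
                 + abs(t01 + t23) + abs(t01 - t23);
        }
        return sum >> 1;
    }

The assembly absorbs the final >>1 into the pmaxsw step via the
identity abs(a+b) + abs(a-b) == 2 * max(abs(a), abs(b)); a C mirror of
the new reduction epilogue follows the patch.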
diff -r f9c0e1f233cc -r 507135d8bcdc source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Apr 15 16:20:27 2015 +0530
+++ b/source/common/x86/pixel-a.asm Thu Apr 16 11:38:32 2015 +0530
@@ -10519,156 +10519,102 @@
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
- movu xm4, [r0]
- movu xm5, [r0 + r1]
- movu xm0, [r2]
- movu xm1, [r2 + r3]
-
- vpermq m4, m4, 01010000b
- vpermq m5, m5, 01010000b
- vpermq m0, m0, 01010000b
- vpermq m1, m1, 01010000b
-
- pmaddubsw m4, m7
- pmaddubsw m0, m7
- pmaddubsw m5, m7
- pmaddubsw m1, m7
- psubw m0, m4
- psubw m1, m5
-
- movu xm4, [r0 + r1 * 2]
- movu xm5, [r0 + r4]
- movu xm2, [r2 + r3 * 2]
- movu xm3, [r2 + r5]
-
- vpermq m4, m4, 01010000b
- vpermq m5, m5, 01010000b
- vpermq m2, m2, 01010000b
- vpermq m3, m3, 01010000b
-
- pmaddubsw m4, m7
- pmaddubsw m2, m7
- pmaddubsw m5, m7
- pmaddubsw m3, m7
- psubw m2, m4
- psubw m3, m5
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
- paddw m4, m0, m1
- psubw m1, m1, m0
- paddw m0, m2, m3
- psubw m3, m3, m2
- paddw m2, m4, m0
- psubw m0, m0, m4
- paddw m4, m1, m3
- psubw m3, m3, m1
- pabsw m2, m2
- pabsw m0, m0
- pabsw m4, m4
- pabsw m3, m3
- pblendw m1, m2, m0, 10101010b
- pslld m0, m0, 16
- psrld m2, m2, 16
- por m0, m0, m2
- pmaxsw m1, m1, m0
- pxor m9, m9, m9
- mova m8, m1
- punpcklwd m8, m8, m9
- paddd m6, m6, m8
- mova m8, m1
- punpckhwd m8, m8, m9
- paddd m6, m6, m8
- pblendw m2, m4, m3, 10101010b
- pslld m3, m3, 16
- psrld m4, m4, 16
- por m3, m3, m4
- pmaxsw m2, m2, m3
- pxor m9, m9, m9
- mova m8, m2
- punpcklwd m8, m8, m9
- paddd m6, m6, m8
- mova m8, m2
- punpckhwd m8, m8, m9
- paddd m6, m6, m8
-
- movu xm4, [r0]
- movu xm5, [r0 + r1]
- movu xm1, [r2]
- movu xm2, [r2 + r3]
-
- vpermq m4, m4, 01010000b
- vpermq m5, m5, 01010000b
- vpermq m1, m1, 01010000b
- vpermq m2, m2, 01010000b
-
- pmaddubsw m4, m4, m7
- pmaddubsw m1, m1, m7
- pmaddubsw m5, m5, m7
- pmaddubsw m2, m2, m7
- psubw m1, m1, m4
- psubw m2, m2, m5
-
- movu xm4, [r0 + r1 * 2]
- movu xm5, [r0 + r4]
- movu xm0, [r2 + r3 * 2]
- movu xm3, [r2 + r5]
-
- vpermq m4, m4, 01010000b
- vpermq m5, m5, 01010000b
- vpermq m0, m0, 01010000b
- vpermq m3, m3, 01010000b
-
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
-
- pmaddubsw m4, m4, m7
- pmaddubsw m0, m0, m7
- pmaddubsw m5, m5, m7
- pmaddubsw m3, m3, m7
- psubw m0, m0, m4
- psubw m3, m3, m5
- paddw m4, m1, m2
- psubw m2, m2, m1
- paddw m1, m0, m3
- psubw m3, m3, m0
- paddw m0, m4, m1
- psubw m1, m1, m4
- paddw m4, m2, m3
- psubw m3, m3, m2
- pabsw m0, m0
- pabsw m1, m1
- pabsw m4, m4
- pabsw m3, m3
- pblendw m2, m0, m1, 10101010b
- pslld m1, m1, 16
- psrld m0, m0, 16
- por m1, m1, m0
- pmaxsw m2, m2, m1
- pxor m9, m9, m9
- mova m8, m2
- punpcklwd m8, m8, m9
- paddd m6, m6, m8
- mova m8, m2
- punpckhwd m8, m8, m9
- paddd m6, m6, m8
- pblendw m0, m4, m3, 10101010b
- pslld m3, m3, 16
- psrld m4, m4, 16
- por m3, m3, m4
- pmaxsw m0, m0, m3
- pxor m9, m9, m9
- mova m8, m0
- punpcklwd m8, m8, m9
- paddd m6, m6, m8
- mova m8, m0
- punpckhwd m8, m8, m9
- paddd m6, m6, m8
+ vbroadcasti128 m0, [r0]
+ vbroadcasti128 m4, [r2]
+ vbroadcasti128 m1, [r0 + r1]
+ vbroadcasti128 m5, [r2 + r3]
+ pmaddubsw m4, m7
+ pmaddubsw m0, m7
+ pmaddubsw m5, m7
+ pmaddubsw m1, m7
+ psubw m0, m4
+ psubw m1, m5
+ vbroadcasti128 m2, [r0 + r1 * 2]
+ vbroadcasti128 m4, [r2 + r3 * 2]
+ vbroadcasti128 m3, [r0 + r4]
+ vbroadcasti128 m5, [r2 + r5]
+ pmaddubsw m4, m7
+ pmaddubsw m2, m7
+ pmaddubsw m5, m7
+ pmaddubsw m3, m7
+ psubw m2, m4
+ psubw m3, m5
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ paddw m4, m0, m1
+ psubw m1, m1, m0
+ paddw m0, m2, m3
+ psubw m3, m2
+ paddw m2, m4, m0
+ psubw m0, m4
+ paddw m4, m1, m3
+ psubw m3, m1
+ pabsw m2, m2
+ pabsw m0, m0
+ pabsw m4, m4
+ pabsw m3, m3
+ pblendw m1, m2, m0, 10101010b
+ pslld m0, 16
+ psrld m2, 16
+ por m0, m2
+ pmaxsw m1, m0
+ paddw m6, m1
+ pblendw m2, m4, m3, 10101010b
+ pslld m3, 16
+ psrld m4, 16
+ por m3, m4
+ pmaxsw m2, m3
+ paddw m6, m2
+ vbroadcasti128 m1, [r0]
+ vbroadcasti128 m4, [r2]
+ vbroadcasti128 m2, [r0 + r1]
+ vbroadcasti128 m5, [r2 + r3]
+ pmaddubsw m4, m7
+ pmaddubsw m1, m7
+ pmaddubsw m5, m7
+ pmaddubsw m2, m7
+ psubw m1, m4
+ psubw m2, m5
+ vbroadcasti128 m0, [r0 + r1 * 2]
+ vbroadcasti128 m4, [r2 + r3 * 2]
+ vbroadcasti128 m3, [r0 + r4]
+ vbroadcasti128 m5, [r2 + r5]
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+ pmaddubsw m4, m7
+ pmaddubsw m0, m7
+ pmaddubsw m5, m7
+ pmaddubsw m3, m7
+ psubw m0, m4
+ psubw m3, m5
+ paddw m4, m1, m2
+ psubw m2, m1
+ paddw m1, m0, m3
+ psubw m3, m0
+ paddw m0, m4, m1
+ psubw m1, m4
+ paddw m4, m2, m3
+ psubw m3, m2
+ pabsw m0, m0
+ pabsw m1, m1
+ pabsw m4, m4
+ pabsw m3, m3
+ pblendw m2, m0, m1, 10101010b
+ pslld m1, 16
+ psrld m0, 16
+ por m1, m0
+ pmaxsw m2, m1
+ paddw m6, m2
+ pblendw m0, m4, m3, 10101010b
+ pslld m3, 16
+ psrld m4, 16
+ por m3, m4
+ pmaxsw m0, m3
+ paddw m6, m0
ret
-cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && cpuflag(avx2)
- mova m7, [hmul_8p]
+cglobal pixel_satd_32x8, 4,8,8 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
@@ -10682,17 +10628,18 @@
call calc_satd_16x8
- vextracti128 xm8, m6, 1
- paddd xm6, xm8
- movhlps xm7, xm6
- paddd xm6, xm7
- pshufd xm7, xm6, 1
- paddd xm6, xm7
- movd eax, xm6
- RET
-
-cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2)
- mova m7, [hmul_8p]
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ pmaddwd xm0, [pw_1]
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_32x16, 4,8,8 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
@@ -10701,22 +10648,25 @@
call calc_satd_16x8
call calc_satd_16x8
+
lea r0, [r6 + 16]
lea r2, [r7 + 16]
+
call calc_satd_16x8
call calc_satd_16x8
- vextracti128 xm8, m6, 1
- paddd xm6, xm8
- movhlps xm7, xm6
- paddd xm6, xm7
- pshufd xm7, xm6, 1
- paddd xm6, xm7
- movd eax, xm6
- RET
-
-cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2)
- mova m7, [hmul_8p]
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ pmaddwd xm0, [pw_1]
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_32x24, 4,8,8 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
@@ -10726,23 +10676,26 @@
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
+
lea r0, [r6 + 16]
lea r2, [r7 + 16]
+
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
- vextracti128 xm8, m6, 1
- paddd xm6, xm8
- movhlps xm7, xm6
- paddd xm6, xm7
- pshufd xm7, xm6, 1
- paddd xm6, xm7
- movd eax, xm6
- RET
-
-cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2)
- mova m7, [hmul_8p]
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ pmaddwd xm0, [pw_1]
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_32x32, 4,8,8 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
@@ -10753,24 +10706,27 @@
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
+
lea r0, [r6 + 16]
lea r2, [r7 + 16]
+
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
- vextracti128 xm8, m6, 1
- paddd xm6, xm8
- movhlps xm7, xm6
- paddd xm6, xm7
- pshufd xm7, xm6, 1
- paddd xm6, xm7
- movd eax, xm6
- RET
-
-cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
- mova m7, [hmul_8p]
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ pmaddwd xm0, [pw_1]
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_32x64, 4,8,9 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
@@ -10785,8 +10741,13 @@
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
+
+ mova m8, m6 ; to avoid 16-bit overflow, save the first half's sums in m8
+ pxor m6, m6
+
lea r0, [r6 + 16]
lea r2, [r7 + 16]
+
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
@@ -10796,13 +10757,18 @@
call calc_satd_16x8
call calc_satd_16x8
- vextracti128 xm8, m6, 1
- paddd xm6, xm8
- movhlps xm7, xm6
- paddd xm6, xm7
- pshufd xm7, xm6, 1
- paddd xm6, xm7
- movd eax, xm6
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddw xm1, xm8
+ paddw xm0, xm6
+ pmaddwd xm1, [pw_1]
+ pmaddwd xm0, [pw_1]
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
RET
%endif ; if ARCH_X86_64 == 1
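
As an illustration only (this helper is not part of the patch), the new
reduction epilogue corresponds to the following intrinsics, with acc
playing the role of the 16-bit accumulator m6:

    #include <immintrin.h>

    static inline int satd_reduce(__m256i acc)
    {
        __m128i lo = _mm256_castsi256_si128(acc);
        __m128i hi = _mm256_extracti128_si256(acc, 1);     /* vextracti128 */
        __m128i w  = _mm_add_epi16(lo, hi);                /* paddw */
        __m128i d  = _mm_madd_epi16(w, _mm_set1_epi16(1)); /* pmaddwd [pw_1] */
        d = _mm_add_epi32(d, _mm_unpackhi_epi64(d, d));    /* movhlps + paddd */
        d = _mm_add_epi32(d, _mm_srli_si128(d, 4));        /* fold last dword */
        return _mm_cvtsi128_si32(d);                       /* movd eax */
    }

pixel_satd_32x64 effectively runs this fold twice: once the left 16
columns are done, their 16-bit sums are parked in m8 and the
accumulator is cleared for the right 16 columns; both halves are then
widened with pmaddwd and combined as 32-bit values, so no word lane
ever accumulates more than half the block's cost.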