[x265] [PATCH] asm: avx2 code for satd_32xN
dnyaneshwar at multicorewareinc.com
Tue Apr 14 11:03:36 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1429001011 -19800
# Tue Apr 14 14:13:31 2015 +0530
# Node ID 5644a5b24ce03290de3a5bb0fc4d49cc00a19ae2
# Parent dd456de98c239b86e29bf349881854a699056240
asm: avx2 code for satd_32xN
AVX2:
                speedup   optimized cycles   C cycles
satd[ 32x8]       8.40x             957.22    8040.38
satd[32x16]       8.31x            1950.86   16214.44
satd[32x24]       8.50x            2897.62   24636.81
satd[32x32]       8.88x            3952.35   35115.40
satd[32x64]       9.18x            7334.90   67312.13

AVX:
                speedup   optimized cycles   C cycles
satd[ 32x8]       4.63x            1738.62    8048.18
satd[32x16]       5.01x            3249.63   16295.51
satd[32x24]       5.30x            4767.54   25279.60
satd[32x32]       5.67x            6156.74   34895.57
satd[32x64]       5.59x           11708.14   65479.60
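
For reference, SATD is the sum of absolute values of a 4x4 Hadamard transform of the
prediction residual, halved because the transform is unnormalized. The sketch below is
illustrative only (it is not x265's optimized C primitive; the function name and plain
int arithmetic are assumptions): it shows the per-4x4 cost that the AVX2 routines in
this patch accumulate over a 32xN partition.

    // Illustrative reference only (not x265's C primitive).
    #include <cstdint>
    #include <cstdlib>

    static int satd_4x4_ref(const uint8_t* pix0, intptr_t stride0,
                            const uint8_t* pix1, intptr_t stride1)
    {
        int d[4][4], t[4][4];

        // prediction residual
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = pix0[i * stride0 + j] - pix1[i * stride1 + j];

        // horizontal 4-point Hadamard transform of each row
        for (int i = 0; i < 4; i++)
        {
            int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
            t[i][0] = s01 + s23;
            t[i][1] = s01 - s23;
            t[i][2] = d01 + d23;
            t[i][3] = d01 - d23;
        }

        // vertical transform of each column, then sum of magnitudes
        int sum = 0;
        for (int j = 0; j < 4; j++)
        {
            int s01 = t[0][j] + t[1][j], d01 = t[0][j] - t[1][j];
            int s23 = t[2][j] + t[3][j], d23 = t[2][j] - t[3][j];
            sum += abs(s01 + s23) + abs(s01 - s23) + abs(d01 + d23) + abs(d01 - d23);
        }
        return sum >> 1; // unnormalized transform: halve the summed magnitudes
    }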
diff -r dd456de98c23 -r 5644a5b24ce0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp Tue Apr 14 14:13:31 2015 +0530
@@ -1669,6 +1669,12 @@
p.pu[LUMA_8x16].satd = x265_pixel_satd_8x16_avx2;
p.pu[LUMA_8x8].satd = x265_pixel_satd_8x8_avx2;
+ p.pu[LUMA_32x8].satd = x265_pixel_satd_32x8_avx2;
+ p.pu[LUMA_32x16].satd = x265_pixel_satd_32x16_avx2;
+ p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_avx2;
+ p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_avx2;
+ p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_avx2;
+
p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_avx2;
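
The table assignments above are the only wiring the new routines need: encoder code
reaches them through the primitive table rather than by symbol name, so builds or CPUs
without AVX2 keep the previously selected entries. A minimal caller-side sketch (the
buffer and stride variables are hypothetical; the table, pu and satd members follow
the hunk above):

    // Hypothetical caller-side view; fenc/pred and their strides are made up.
    int cost = primitives.pu[LUMA_32x16].satd(fenc, fencStride, pred, predStride);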
diff -r dd456de98c23 -r 5644a5b24ce0 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/pixel-a.asm Tue Apr 14 14:13:31 2015 +0530
@@ -10506,3 +10506,303 @@
mov rsp, r5
RET
%endif
+
+;;---------------------------------------------------------------
+;; SATD AVX2
+;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t)
+;;---------------------------------------------------------------
+;; r0 - pix0
+;; r1 - pix0Stride
+;; r2 - pix1
+;; r3 - pix1Stride
+
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
+ movu xm4, [r0] ; rows 0 and 1 of the 16-pixel-wide strip from pix0
+ movu xm5, [r0 + r1]
+ movu xm0, [r2] ; rows 0 and 1 from pix1
+ movu xm1, [r2 + r3]
+
+ vpermq m4, m4, 01010000b ; duplicate each 8-pixel half into its own 128-bit lane
+ vpermq m5, m5, 01010000b
+ vpermq m0, m0, 01010000b
+ vpermq m1, m1, 01010000b
+
+ pmaddubsw m4, m7 ; multiply-add with the +1/-1 pattern: widens the bytes to words
+ pmaddubsw m0, m7 ; and performs the first horizontal Hadamard butterfly
+ pmaddubsw m5, m7
+ pmaddubsw m1, m7
+ psubw m0, m4 ; residuals of rows 0-1 (pix1 - pix0), in butterflied form
+ psubw m1, m5
+
+ movu xm4, [r0 + r1 * 2]
+ movu xm5, [r0 + r4]
+ movu xm2, [r2 + r3 * 2]
+ movu xm3, [r2 + r5]
+
+ vpermq m4, m4, 01010000b
+ vpermq m5, m5, 01010000b
+ vpermq m2, m2, 01010000b
+ vpermq m3, m3, 01010000b
+
+ pmaddubsw m4, m7
+ pmaddubsw m2, m7
+ pmaddubsw m5, m7
+ pmaddubsw m3, m7
+ psubw m2, m4
+ psubw m3, m5
+
+ lea r0, [r0 + r1 * 4] ; advance both blocks by four rows
+ lea r2, [r2 + r3 * 4]
+
+ paddw m4, m0, m1 ; vertical 4-point Hadamard across the four rows (stage 1)
+ psubw m1, m1, m0
+ paddw m0, m2, m3
+ psubw m3, m3, m2
+ paddw m2, m4, m0 ; vertical stage 2
+ psubw m0, m0, m4
+ paddw m4, m1, m3
+ psubw m3, m3, m1
+ pabsw m2, m2 ; absolute values ahead of the folded final stage
+ pabsw m0, m0
+ pabsw m4, m4
+ pabsw m3, m3
+ pblendw m1, m2, m0, 10101010b ; fold the last Hadamard stage into a max:
+ pslld m0, m0, 16 ; |a+b| + |a-b| == 2 * max(|a|, |b|), which also
+ psrld m2, m2, 16 ; supplies SATD's final halving
+ por m0, m0, m2
+ pmaxsw m1, m1, m0
+ pxor m9, m9, m9 ; zero-extend the word maxima and accumulate dword sums in m6
+ mova m8, m1
+ punpcklwd m8, m8, m9
+ paddd m6, m6, m8
+ mova m8, m1
+ punpckhwd m8, m8, m9
+ paddd m6, m6, m8
+ pblendw m2, m4, m3, 10101010b
+ pslld m3, m3, 16
+ psrld m4, m4, 16
+ por m3, m3, m4
+ pmaxsw m2, m2, m3
+ pxor m9, m9, m9
+ mova m8, m2
+ punpcklwd m8, m8, m9
+ paddd m6, m6, m8
+ mova m8, m2
+ punpckhwd m8, m8, m9
+ paddd m6, m6, m8
+
+ movu xm4, [r0] ; same transform for rows 4-7 of the strip
+ movu xm5, [r0 + r1]
+ movu xm1, [r2]
+ movu xm2, [r2 + r3]
+
+ vpermq m4, m4, 01010000b
+ vpermq m5, m5, 01010000b
+ vpermq m1, m1, 01010000b
+ vpermq m2, m2, 01010000b
+
+ pmaddubsw m4, m4, m7
+ pmaddubsw m1, m1, m7
+ pmaddubsw m5, m5, m7
+ pmaddubsw m2, m2, m7
+ psubw m1, m1, m4
+ psubw m2, m2, m5
+
+ movu xm4, [r0 + r1 * 2]
+ movu xm5, [r0 + r4]
+ movu xm0, [r2 + r3 * 2]
+ movu xm3, [r2 + r5]
+
+ vpermq m4, m4, 01010000b
+ vpermq m5, m5, 01010000b
+ vpermq m0, m0, 01010000b
+ vpermq m3, m3, 01010000b
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r3 * 4]
+
+ pmaddubsw m4, m4, m7
+ pmaddubsw m0, m0, m7
+ pmaddubsw m5, m5, m7
+ pmaddubsw m3, m3, m7
+ psubw m0, m0, m4
+ psubw m3, m3, m5
+ paddw m4, m1, m2
+ psubw m2, m2, m1
+ paddw m1, m0, m3
+ psubw m3, m3, m0
+ paddw m0, m4, m1
+ psubw m1, m1, m4
+ paddw m4, m2, m3
+ psubw m3, m3, m2
+ pabsw m0, m0
+ pabsw m1, m1
+ pabsw m4, m4
+ pabsw m3, m3
+ pblendw m2, m0, m1, 10101010b
+ pslld m1, m1, 16
+ psrld m0, m0, 16
+ por m1, m1, m0
+ pmaxsw m2, m2, m1
+ pxor m9, m9, m9
+ mova m8, m2
+ punpcklwd m8, m8, m9
+ paddd m6, m6, m8
+ mova m8, m2
+ punpckhwd m8, m8, m9
+ paddd m6, m6, m8
+ pblendw m0, m4, m3, 10101010b
+ pslld m3, m3, 16
+ psrld m4, m4, 16
+ por m3, m3, m4
+ pmaxsw m0, m0, m3
+ pxor m9, m9, m9
+ mova m8, m0
+ punpcklwd m8, m8, m9
+ paddd m6, m6, m8
+ mova m8, m0
+ punpckhwd m8, m8, m9
+ paddd m6, m6, m8
+ ret
+
+cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_8p] ; +1/-1 multiplier pattern used by pmaddubsw
+ lea r4, [3 * r1] ; r4/r5 = 3 * stride, consumed by calc_satd_16x8
+ lea r5, [3 * r3]
+ pxor m6, m6 ; running SATD accumulator (eight dwords)
+ mov r6, r0 ; keep the base pointers; calc_satd_16x8 advances r0/r2
+ mov r7, r2
+
+ call calc_satd_16x8
+
+ lea r0, [r6 + 16] ; left 16 columns done; point at columns 16-31
+ lea r2, [r7 + 16]
+
+ call calc_satd_16x8
+
+ vextracti128 xm8, m6, 1 ; reduce the eight dword partial sums to a scalar
+ paddd xm6, xm8
+ movhlps xm7, xm6
+ paddd xm6, xm7
+ pshufd xm7, xm6, 1
+ paddd xm6, xm7
+ movd eax, xm6 ; return the SATD cost
+ RET
+
+cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_8p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ vextracti128 xm8, m6, 1
+ paddd xm6, xm8
+ movhlps xm7, xm6
+ paddd xm6, xm7
+ pshufd xm7, xm6, 1
+ paddd xm6, xm7
+ movd eax, xm6
+ RET
+
+cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_8p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ vextracti128 xm8, m6, 1
+ paddd xm6, xm8
+ movhlps xm7, xm6
+ paddd xm6, xm7
+ pshufd xm7, xm6, 1
+ paddd xm6, xm7
+ movd eax, xm6
+ RET
+
+cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_8p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ vextracti128 xm8, m6, 1
+ paddd xm6, xm8
+ movhlps xm7, xm6
+ paddd xm6, xm7
+ pshufd xm7, xm6, 1
+ paddd xm6, xm7
+ movd eax, xm6
+ RET
+
+cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_8p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ vextracti128 xm8, m6, 1
+ paddd xm6, xm8
+ movhlps xm7, xm6
+ paddd xm6, xm7
+ pshufd xm7, xm6, 1
+ paddd xm6, xm7
+ movd eax, xm6
+ RET
+
+%endif ; if ARCH_X86_64 == 1
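
Every pixel_satd_32xN variant above ends with the same six-instruction epilogue: m6
holds eight 32-bit partial sums that are reduced to a single scalar returned in eax.
For readers more comfortable with intrinsics, an equivalent reduction might look like
the sketch below (illustrative only; the helper name is made up, and the comments note
which instruction each step mirrors).

    #include <immintrin.h>
    #include <stdint.h>

    // Hypothetical helper mirroring the asm epilogue above.
    static uint32_t reduce_satd_acc(__m256i acc)
    {
        __m128i lo = _mm256_castsi256_si128(acc);       // low four dwords of m6
        __m128i hi = _mm256_extracti128_si256(acc, 1);  // vextracti128 xm8, m6, 1
        __m128i s  = _mm_add_epi32(lo, hi);             // paddd xm6, xm8
        s = _mm_add_epi32(s, _mm_unpackhi_epi64(s, s)); // movhlps + paddd
        s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 1));  // pshufd xm7, xm6, 1 + paddd
        return (uint32_t)_mm_cvtsi128_si32(s);          // movd eax, xm6
    }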