[x265] [PATCH 2 of 3] asm: psyCost_pp avx2 asm code for main12
dnyaneshwar at multicorewareinc.com
Wed Dec 9 09:50:51 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1448963172 -19800
# Tue Dec 01 15:16:12 2015 +0530
# Node ID 9357c1f448a7b987cebfd3cc5542cc6c65e63fe2
# Parent e2b07541670331ab0cd94b5f312f8f7cac893f92
asm: psyCost_pp avx2 asm code for main12
                       speedup   avx2(cycles)    c(cycles)
psy_cost_pp[8x8]         6.55x        1254.76      8224.62
psy_cost_pp[16x16]       6.51x        5087.56     33111.62
psy_cost_pp[32x32]       6.50x       20230.92    131523.63
psy_cost_pp[64x64]       6.57x       80351.48    528226.25
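
For reference: psy_cost_pp measures how perceptual energy differs between
source and reconstruction. Per 8x8 block, energy = sa8d(block vs. a zero
block) - (sad(block vs. a zero block) >> 2), and the partition cost is the
sum of |source energy - recon energy| over its 8x8 blocks. The main12 path
must widen pixels to dwords (pmovzxwd) because 12-bit Hadamard intermediates
overflow 16-bit lanes. A minimal scalar sketch, with illustrative helper
names rather than the x265 API:

/* Scalar model of psy_cost_pp (sketch only; the sa8d normalization
 * follows the reference C implementation). */
#include <cstdint>
#include <cstdlib>

typedef std::uint16_t pixel;   // main12: 12-bit samples in 16-bit storage

// In-place 1-D 8-point Hadamard butterfly (fast Walsh-Hadamard transform).
static void hadamard8(int v[8])
{
    for (int step = 1; step < 8; step <<= 1)
        for (int i = 0; i < 8; i += step << 1)
            for (int j = i; j < i + step; j++)
            {
                int a = v[j], b = v[j + step];
                v[j] = a + b;
                v[j + step] = a - b;
            }
}

// SA8D of an 8x8 block against an all-zero block: 2-D Hadamard, then the
// sum of absolute coefficients, normalized as in the reference C.
static int sa8d_8x8_vs_zero(const pixel* p, std::intptr_t stride)
{
    int m[8][8], sum = 0;
    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 8; j++)
            m[i][j] = p[i * stride + j];
        hadamard8(m[i]);               // transform rows
    }
    for (int j = 0; j < 8; j++)
    {
        int col[8];
        for (int i = 0; i < 8; i++)
            col[i] = m[i][j];
        hadamard8(col);                // transform columns
        for (int i = 0; i < 8; i++)
            sum += std::abs(col[i]);
    }
    return (sum + 2) >> 2;
}

// SAD against a zero block is just the pixel sum.
static int sad_8x8_vs_zero(const pixel* p, std::intptr_t stride)
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            sum += p[i * stride + j];
    return sum;
}

static int energy(const pixel* p, std::intptr_t stride)
{
    return sa8d_8x8_vs_zero(p, stride) - (sad_8x8_vs_zero(p, stride) >> 2);
}

// Tile the 8x8 energy difference over a dim x dim partition, as the
// .loopW/.loopH structure in the asm below does, 8 pixels at a time.
std::uint32_t psyCost_pp_ref(const pixel* src, std::intptr_t sstride,
                             const pixel* rec, std::intptr_t rstride, int dim)
{
    std::uint32_t total = 0;
    for (int y = 0; y < dim; y += 8)
        for (int x = 0; x < dim; x += 8)
            total += std::abs(energy(src + y * sstride + x, sstride) -
                              energy(rec + y * rstride + x, rstride));
    return total;
}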
diff -r e2b075416703 -r 9357c1f448a7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 09 13:13:57 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 01 15:16:12 2015 +0530
@@ -1479,12 +1479,11 @@
p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
-#if X265_DEPTH <= 10
+
p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
-#endif
p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r e2b075416703 -r 9357c1f448a7 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 09 13:13:57 2015 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 01 15:16:12 2015 +0530
@@ -10090,16 +10090,272 @@
pabsd xm1, xm1
%endmacro
+%macro PSY_COST_PP_8x8_MAIN12 0
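+ ; Inputs:   r0/r1 = source base/stride, r2/r3 = recon base/stride (bytes)
+ ; Output:   m11 = |source energy - recon energy| for one 8x8 block,
+ ;           where energy = sa8d_8x8 - (sad_8x8 >> 2)
+ ; Clobbers: r4, r5, m0-m11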
+ ; load source pixels
+ lea r4, [r1 * 3]
+ pmovzxwd m0, [r0]
+ pmovzxwd m1, [r0 + r1]
+ pmovzxwd m2, [r0 + r1 * 2]
+ pmovzxwd m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ pmovzxwd m4, [r5]
+ pmovzxwd m5, [r5 + r1]
+ pmovzxwd m6, [r5 + r1 * 2]
+ pmovzxwd m7, [r5 + r4]
+
+ ; source SAD
+ paddd m8, m0, m1
+ paddd m8, m2
+ paddd m8, m3
+ paddd m8, m4
+ paddd m8, m5
+ paddd m8, m6
+ paddd m8, m7
+
+ vextracti128 xm9, m8, 1
+ paddd m8, m9
+ movhlps xm9, xm8
+ paddd xm8, xm9
+ pshuflw xm9, xm8, 0Eh
+ paddd xm8, xm9 ; sad_8x8
+ psrld m8, 2 ; sad_8x8 >> 2
+
+ ; source SA8D
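+ ; 8x8 Hadamard on dword lanes: the paddd/psubd pairs are butterflies,
+ ; punpck{l,h}dq, punpck{l,h}qdq and vperm2i128 perform the transposes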
+ psubd m9, m1, m0
+ paddd m0, m1
+ psubd m1, m3, m2
+ paddd m2, m3
+ punpckhdq m3, m0, m9
+ punpckldq m0, m9
+ psubd m9, m3, m0
+ paddd m0, m3
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m10, m3, m2
+ paddd m2, m3
+ psubd m3, m5, m4
+ paddd m4, m5
+ psubd m5, m7, m6
+ paddd m6, m7
+ punpckhdq m1, m4, m3
+ punpckldq m4, m3
+ psubd m7, m1, m4
+ paddd m4, m1
+ punpckhdq m3, m6, m5
+ punpckldq m6, m5
+ psubd m1, m3, m6
+ paddd m6, m3
+ psubd m3, m2, m0
+ paddd m0, m2
+ psubd m2, m10, m9
+ paddd m9, m10
+ punpckhqdq m5, m0, m3
+ punpcklqdq m0, m3
+ psubd m10, m5, m0
+ paddd m0, m5
+ punpckhqdq m3, m9, m2
+ punpcklqdq m9, m2
+ psubd m5, m3, m9
+ paddd m9, m3
+ psubd m3, m6, m4
+ paddd m4, m6
+ psubd m6, m1, m7
+ paddd m7, m1
+ punpckhqdq m2, m4, m3
+ punpcklqdq m4, m3
+ psubd m1, m2, m4
+ paddd m4, m2
+ punpckhqdq m3, m7, m6
+ punpcklqdq m7, m6
+ psubd m2, m3, m7
+ paddd m7, m3
+ psubd m3, m4, m0
+ paddd m0, m4
+ psubd m4, m1, m10
+ paddd m10, m1
+ vinserti128 m6, m0, xm3, 1
+ vperm2i128 m0, m0, m3, 00110001b
+ pabsd m0, m0
+ pabsd m6, m6
+ pmaxsd m0, m6
+ vinserti128 m3, m10, xm4, 1
+ vperm2i128 m10, m10, m4, 00110001b
+ pabsd m10, m10
+ pabsd m3, m3
+ pmaxsd m10, m3
+ psubd m3, m7, m9
+ paddd m9, m7
+ psubd m7, m2, m5
+ paddd m5, m2
+ vinserti128 m4, m9, xm3, 1
+ vperm2i128 m9, m9, m3, 00110001b
+ pabsd m9, m9
+ pabsd m4, m4
+ pmaxsd m9, m4
+ vinserti128 m3, m5, xm7, 1
+ vperm2i128 m5, m5, m7, 00110001b
+ pabsd m5, m5
+ pabsd m3, m3
+ pmaxsd m5, m3
+ paddd m0, m9
+ paddd m0, m10
+ paddd m0, m5
+
+ vextracti128 xm9, m0, 1
+ paddd m0, m9
+ movhlps xm9, xm0
+ paddd xm0, xm9
+ pshuflw xm9, xm0, 0Eh
+ paddd xm0, xm9
+ paddd m0, [pd_1]
+ psrld m0, 1 ; sa8d_8x8
+ psubd m11, m0, m8 ; source energy: sa8d_8x8 - (sad_8x8 >> 2)
+
+ ; load recon pixels
+ lea r4, [r3 * 3]
+ pmovzxwd m0, [r2]
+ pmovzxwd m1, [r2 + r3]
+ pmovzxwd m2, [r2 + r3 * 2]
+ pmovzxwd m3, [r2 + r4]
+ lea r5, [r2 + r3 * 4]
+ pmovzxwd m4, [r5]
+ pmovzxwd m5, [r5 + r3]
+ pmovzxwd m6, [r5 + r3 * 2]
+ pmovzxwd m7, [r5 + r4]
+
+ ; recon SAD
+ paddd m8, m0, m1
+ paddd m8, m2
+ paddd m8, m3
+ paddd m8, m4
+ paddd m8, m5
+ paddd m8, m6
+ paddd m8, m7
+
+ vextracti128 xm9, m8, 1
+ paddd m8, m9
+ movhlps xm9, xm8
+ paddd xm8, xm9
+ pshuflw xm9, xm8, 0Eh
+ paddd xm8, xm9 ; sad_8x8
+ psrld m8, 2 ; sad_8x8 >> 2
+
+ ; recon SA8D
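+ ; same Hadamard/transpose network as the source half above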
+ psubd m9, m1, m0
+ paddd m0, m1
+ psubd m1, m3, m2
+ paddd m2, m3
+ punpckhdq m3, m0, m9
+ punpckldq m0, m9
+ psubd m9, m3, m0
+ paddd m0, m3
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m10, m3, m2
+ paddd m2, m3
+ psubd m3, m5, m4
+ paddd m4, m5
+ psubd m5, m7, m6
+ paddd m6, m7
+ punpckhdq m1, m4, m3
+ punpckldq m4, m3
+ psubd m7, m1, m4
+ paddd m4, m1
+ punpckhdq m3, m6, m5
+ punpckldq m6, m5
+ psubd m1, m3, m6
+ paddd m6, m3
+ psubd m3, m2, m0
+ paddd m0, m2
+ psubd m2, m10, m9
+ paddd m9, m10
+ punpckhqdq m5, m0, m3
+ punpcklqdq m0, m3
+ psubd m10, m5, m0
+ paddd m0, m5
+ punpckhqdq m3, m9, m2
+ punpcklqdq m9, m2
+ psubd m5, m3, m9
+ paddd m9, m3
+ psubd m3, m6, m4
+ paddd m4, m6
+ psubd m6, m1, m7
+ paddd m7, m1
+ punpckhqdq m2, m4, m3
+ punpcklqdq m4, m3
+ psubd m1, m2, m4
+ paddd m4, m2
+ punpckhqdq m3, m7, m6
+ punpcklqdq m7, m6
+ psubd m2, m3, m7
+ paddd m7, m3
+ psubd m3, m4, m0
+ paddd m0, m4
+ psubd m4, m1, m10
+ paddd m10, m1
+ vinserti128 m6, m0, xm3, 1
+ vperm2i128 m0, m0, m3, 00110001b
+ pabsd m0, m0
+ pabsd m6, m6
+ pmaxsd m0, m6
+ vinserti128 m3, m10, xm4, 1
+ vperm2i128 m10, m10, m4, 00110001b
+ pabsd m10, m10
+ pabsd m3, m3
+ pmaxsd m10, m3
+ psubd m3, m7, m9
+ paddd m9, m7
+ psubd m7, m2, m5
+ paddd m5, m2
+ vinserti128 m4, m9, xm3, 1
+ vperm2i128 m9, m9, m3, 00110001b
+ pabsd m9, m9
+ pabsd m4, m4
+ pmaxsd m9, m4
+ vinserti128 m3, m5, xm7, 1
+ vperm2i128 m5, m5, m7, 00110001b
+ pabsd m5, m5
+ pabsd m3, m3
+ pmaxsd m5, m3
+ paddd m0, m9
+ paddd m0, m10
+ paddd m0, m5
+
+ vextracti128 xm9, m0, 1
+ paddd m0, m9
+ movhlps xm9, xm0
+ paddd xm0, xm9
+ pshuflw xm9, xm0, 0Eh
+ paddd xm0, xm9
+ paddd m0, [pd_1]
+ psrld m0, 1 ; sa8d_8x8
+ psubd m0, m8 ; recon energy: sa8d_8x8 - (sad_8x8 >> 2)
+
+ psubd m11, m0 ; source energy - recon energy
+ pabsd m11, m11 ; psy cost of the 8x8 block
+%endmacro
+
%if ARCH_X86_64
-%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_8x8, 4, 8, 12
+ add r1d, r1d ; convert strides to bytes (pixels are 16-bit)
+ add r3d, r3d
+ PSY_COST_PP_8x8_MAIN12
+ movd eax, xm11
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
cglobal psyCost_pp_8x8, 4, 8, 11
add r1d, r1d
add r3d, r3d
PSY_PP_8x8_AVX2
movd eax, xm1
RET
-%else ; !HIGH_BIT_DEPTH
-INIT_YMM avx2
+%endif
+
+%if BIT_DEPTH == 8
cglobal psyCost_pp_8x8, 4, 8, 13
lea r4, [3 * r1]
lea r7, [3 * r3]
@@ -10111,9 +10367,35 @@
RET
%endif
%endif
+
%if ARCH_X86_64
INIT_YMM avx2
-%if HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_16x16, 4, 10, 13
+ add r1d, r1d
+ add r3d, r3d
+ pxor m12, m12 ; psy cost accumulator
+
+ mov r8d, 2
+.loopH:
+ mov r9d, 2
+.loopW:
+ PSY_COST_PP_8x8_MAIN12
+
+ paddd xm12, xm11
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32]
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r8d
+ jnz .loopH
+ movd eax, xm12
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
cglobal psyCost_pp_16x16, 4, 10, 12
add r1d, r1d
add r3d, r3d
@@ -10136,7 +10418,9 @@
jnz .loopH
movd eax, xm11
RET
-%else ; !HIGH_BIT_DEPTH
+%endif
+
+%if BIT_DEPTH == 8
cglobal psyCost_pp_16x16, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
@@ -10162,9 +10446,35 @@
RET
%endif
%endif
+
%if ARCH_X86_64
INIT_YMM avx2
-%if HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_32x32, 4, 10, 13
+ add r1d, r1d
+ add r3d, r3d
+ pxor m12, m12
+
+ mov r8d, 4
+.loopH:
+ mov r9d, 4
+.loopW:
+ PSY_COST_PP_8x8_MAIN12
+
+ paddd xm12, xm11
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r8d
+ jnz .loopH
+ movd eax, xm12
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
cglobal psyCost_pp_32x32, 4, 10, 12
add r1d, r1d
add r3d, r3d
@@ -10187,7 +10497,9 @@
jnz .loopH
movd eax, xm11
RET
-%else ; !HIGH_BIT_DEPTH
+%endif
+
+%if BIT_DEPTH == 8
cglobal psyCost_pp_32x32, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
@@ -10213,9 +10525,35 @@
RET
%endif
%endif
+
%if ARCH_X86_64
INIT_YMM avx2
-%if HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_64x64, 4, 10, 13
+ add r1d, r1d
+ add r3d, r3d
+ pxor m12, m12
+
+ mov r8d, 8
+.loopH:
+ mov r9d, 8
+.loopW:
+ PSY_COST_PP_8x8_MAIN12
+
+ paddd xm12, xm11
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 128]
+ lea r2, [r2 + r3 * 8 - 128]
+ dec r8d
+ jnz .loopH
+ movd eax, xm12
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
cglobal psyCost_pp_64x64, 4, 10, 12
add r1d, r1d
add r3d, r3d
@@ -10238,7 +10576,9 @@
jnz .loopH
movd eax, xm11
RET
-%else ; !HIGH_BIT_DEPTH
+%endif
+
+%if BIT_DEPTH == 8
cglobal psyCost_pp_64x64, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]