[x265] [PATCH] asm: avx2 code for psyCost_pp 8x8, 16x16, 32x32 & 64x64, over 40% faster than the previous asm
dnyaneshwar at multicorewareinc.com
Tue May 19 07:30:43 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431952399 -19800
# Mon May 18 18:03:19 2015 +0530
# Node ID ac32faec79be9c6a60d267086b4563bd884537c0
# Parent d7b100e51e828833eee006f1da93e499ac161d28
asm: avx2 code for psyCost_pp 8x8, 16x16, 32x32 & 64x64, over 40% faster than the previous asm
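
psyCost_pp compares the psycho-visual energy of each 8x8 sub-block of the
source against its reconstruction: per block, a Hadamard (SATD-style)
energy minus a scaled DC term, with the absolute difference of the two
energies accumulated over the CU. A rough C sketch of what these kernels
vectorize (illustrative names only, not the exact reference in pixel.cpp):

    #include <stdint.h>
    #include <stdlib.h>   /* abs */

    typedef uint16_t pixel;   /* HIGH_BIT_DEPTH build */

    /* assumed helper: Hadamard-transform energy of one 8x8 block */
    extern int hadamard8x8(const pixel* p, intptr_t stride);

    static int sum8x8(const pixel* p, intptr_t stride)
    {
        int s = 0;
        for (int i = 0; i < 8; i++, p += stride)
            for (int j = 0; j < 8; j++)
                s += p[j];
        return s;
    }

    /* psy energy of one 8x8 block: AC energy minus the scaled DC term */
    static int energy8x8(const pixel* p, intptr_t stride)
    {
        return hadamard8x8(p, stride) - (sum8x8(p, stride) >> 2);
    }

    static int psyCost_pp(const pixel* source, intptr_t sstride,
                          const pixel* recon,  intptr_t rstride, int size)
    {
        int cost = 0;
        for (int y = 0; y < size; y += 8)
            for (int x = 0; x < size; x += 8)
                cost += abs(energy8x8(source + y * sstride + x, sstride)
                          - energy8x8(recon  + y * rstride + x, rstride));
        return cost;
    }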
diff -r d7b100e51e82 -r ac32faec79be source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon May 18 18:03:19 2015 +0530
@@ -1226,9 +1226,11 @@
p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
p.cu[BLOCK_32x32].psy_cost_ss = x265_psyCost_ss_32x32_avx2;
p.cu[BLOCK_64x64].psy_cost_ss = x265_psyCost_ss_64x64_avx2;
-
p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2;
-
+ p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_avx2;
+ p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_avx2;
+ p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_avx2;
+ p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_avx2;
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
diff -r d7b100e51e82 -r ac32faec79be source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/pixel-a.asm Mon May 18 18:03:19 2015 +0530
@@ -8603,7 +8603,149 @@
pabsd m0, m0
%endmacro
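+; PSY_PP_8x8_AVX2: psy cost of one 8x8 HIGH_BIT_DEPTH block pair.
+; The eight source rows are loaded into the low 128-bit lane of
+; m0-m7 and the matching recon rows into the high lane, so a single
+; Hadamard pass transforms both blocks at once; the result
+; |energy(source) - energy(recon)| is returned in xm1.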
+%macro PSY_PP_8x8_AVX2 0
+ lea r4, [r1 * 3]
+ movu xm0, [r0]
+ movu xm1, [r0 + r1]
+ movu xm2, [r0 + r1 * 2]
+ movu xm3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu xm4, [r5]
+ movu xm5, [r5 + r1]
+ movu xm6, [r5 + r1 * 2]
+ movu xm7, [r5 + r4]
+
+ lea r4, [r3 * 3]
+ vinserti128 m0, m0, [r2], 1
+ vinserti128 m1, m1, [r2 + r3], 1
+ vinserti128 m2, m2, [r2 + r3 * 2], 1
+ vinserti128 m3, m3, [r2 + r4], 1
+ lea r5, [r2 + r3 * 4]
+ vinserti128 m4, m4, [r5], 1
+ vinserti128 m5, m5, [r5 + r3], 1
+ vinserti128 m6, m6, [r5 + r3 * 2], 1
+ vinserti128 m7, m7, [r5 + r4], 1
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, [pw_1]
+
+ psrldq m9, m8, 8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
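+ ; m8 = (sum of the 64 pixels) >> 2 in each lane: the DC term
+ ; subtracted from the Hadamard energy below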
+
+ psubw m9, m1, m0
+ paddw m0, m1
+ psubw m1, m3, m2
+ paddw m2, m3
+ punpckhwd m3, m0, m9
+ punpcklwd m0, m9
+ psubw m9, m3, m0
+ paddw m0, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ psubw m10, m3, m2
+ paddw m2, m3
+ psubw m3, m5, m4
+ paddw m4, m5
+ psubw m5, m7, m6
+ paddw m6, m7
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ psubw m7, m1, m4
+ paddw m4, m1
+ punpckhwd m3, m6, m5
+ punpcklwd m6, m5
+ psubw m1, m3, m6
+ paddw m6, m3
+ psubw m3, m2, m0
+ paddw m0, m2
+ psubw m2, m10, m9
+ paddw m9, m10
+ punpckhdq m5, m0, m3
+ punpckldq m0, m3
+ psubw m10, m5, m0
+ paddw m0, m5
+ punpckhdq m3, m9, m2
+ punpckldq m9, m2
+ psubw m5, m3, m9
+ paddw m9, m3
+ psubw m3, m6, m4
+ paddw m4, m6
+ psubw m6, m1, m7
+ paddw m7, m1
+ punpckhdq m2, m4, m3
+ punpckldq m4, m3
+ psubw m1, m2, m4
+ paddw m4, m2
+ punpckhdq m3, m7, m6
+ punpckldq m7, m6
+ psubw m2, m3, m7
+ paddw m7, m3
+ psubw m3, m4, m0
+ paddw m0, m4
+ psubw m4, m1, m10
+ paddw m10, m1
+ punpckhqdq m6, m0, m3
+ punpcklqdq m0, m3
+ pabsw m0, m0
+ pabsw m6, m6
+ pmaxsw m0, m6
+ punpckhqdq m3, m10, m4
+ punpcklqdq m10, m4
+ pabsw m10, m10
+ pabsw m3, m3
+ pmaxsw m10, m3
+ psubw m3, m7, m9
+ paddw m9, m7
+ psubw m7, m2, m5
+ paddw m5, m2
+ punpckhqdq m4, m9, m3
+ punpcklqdq m9, m3
+ pabsw m9, m9
+ pabsw m4, m4
+ pmaxsw m9, m4
+ punpckhqdq m3, m5, m7
+ punpcklqdq m5, m7
+ pabsw m5, m5
+ pabsw m3, m3
+ pmaxsw m5, m3
+ paddd m0, m9
+ paddd m0, m10
+ paddd m0, m5
+ psrld m9, m0, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m0, m9
+ psrldq m9, m0, 8
+ paddd m0, m9
+ psrldq m9, m0, 4
+ paddd m0, m9
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m0, m8
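+ ; per-lane psy energy: ((hadamard sum + 1) >> 1) - (pixel sum >> 2)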
+
+ vextracti128 xm1, m0, 1
+ psubd xm1, xm0
+ pabsd xm1, xm1
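+ ; xm1 = |energy(source) - energy(recon)| for this 8x8 block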
+%endmacro
+
%if ARCH_X86_64
+%if HIGH_BIT_DEPTH
+cglobal psyCost_pp_8x8, 4, 8, 11
+ add r1d, r1d
+ add r3d, r3d
+ PSY_PP_8x8_AVX2
+ movd eax, xm1
+ RET
+%else ; !HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal psyCost_pp_8x8, 4, 8, 13
lea r4, [3 * r1]
@@ -8615,9 +8757,33 @@
movd eax, xm0
RET
%endif
-
+%endif
%if ARCH_X86_64
INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal psyCost_pp_16x16, 4, 10, 12
+ add r1d, r1d
+ add r3d, r3d
+ pxor m11, m11
+
+ mov r8d, 2
+.loopH:
+ mov r9d, 2
+.loopW:
+ PSY_PP_8x8_AVX2
+
+ paddd xm11, xm1
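+ ; 16 bytes = eight 16-bit pixels: step to the next 8x8 column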
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
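+ ; undo the two 16-byte column steps and advance eight rows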
+ lea r0, [r0 + r1 * 8 - 32]
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r8d
+ jnz .loopH
+ movd eax, xm11
+ RET
+%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_16x16, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
@@ -8642,9 +8808,33 @@
movd eax, xm13
RET
%endif
-
+%endif
%if ARCH_X86_64
INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal psyCost_pp_32x32, 4, 10, 12
+ add r1d, r1d
+ add r3d, r3d
+ pxor m11, m11
+
+ mov r8d, 4
+.loopH:
+ mov r9d, 4
+.loopW:
+ PSY_PP_8x8_AVX2
+
+ paddd xm11, xm1
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r8d
+ jnz .loopH
+ movd eax, xm11
+ RET
+%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_32x32, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
@@ -8669,9 +8859,33 @@
movd eax, xm13
RET
%endif
-
+%endif
%if ARCH_X86_64
INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal psyCost_pp_64x64, 4, 10, 12
+ add r1d, r1d
+ add r3d, r3d
+ pxor m11, m11
+
+ mov r8d, 8
+.loopH:
+ mov r9d, 8
+.loopW:
+ PSY_PP_8x8_AVX2
+
+ paddd xm11, xm1
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 128]
+ lea r2, [r2 + r3 * 8 - 128]
+ dec r8d
+ jnz .loopH
+ movd eax, xm11
+ RET
+%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_64x64, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
@@ -8696,6 +8910,7 @@
movd eax, xm13
RET
%endif
+%endif
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)