[x265] [PATCH] asm: avx2 code for high_bit_depth psyCost_pp_4x4, reduced 400c->250c
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue May 12 08:20:18 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431344809 -19800
# Mon May 11 17:16:49 2015 +0530
# Node ID b64b19125ced7e0f2de57e190e5e14be274e0d7e
# Parent 4109cf92731a8a6cfe35019d205476e8719d4c67
asm: avx2 code for high_bit_depth psyCost_pp_4x4, reduced 400c->250c
diff -r 4109cf92731a -r b64b19125ced source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 12 11:13:47 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon May 11 17:16:49 2015 +0530
@@ -1181,6 +1181,8 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2;
+
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
diff -r 4109cf92731a -r b64b19125ced source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue May 12 11:13:47 2015 +0530
+++ b/source/common/x86/pixel-a.asm Mon May 11 17:16:49 2015 +0530
@@ -8260,6 +8260,76 @@
%endif
INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal psyCost_pp_4x4, 4, 5, 6 ; returns in eax |costA - costB| for two 4x4 blocks (A at r0/stride r1, B at r2/stride r3); x86inc: 4 args, 5 GPRs, 6 vector regs
+ add r1d, r1d ; double stride of block A - presumably pixel stride -> byte stride for 16-bit samples (TODO confirm)
+ add r3d, r3d ; same for block B
+ lea r4, [r1 * 3] ; r4 = 3 * stride A, used to address row 3
+ movddup xm0, [r0] ; load 8 bytes of row 0 of A and duplicate them into both qwords
+ movddup xm1, [r0 + r1] ; row 1 of A
+ movddup xm2, [r0 + r1 * 2] ; row 2 of A
+ movddup xm3, [r0 + r4] ; row 3 of A
+
+ lea r4, [r3 * 3] ; r4 = 3 * stride B
+ movddup xm4, [r2] ; row 0 of B, duplicated
+ movddup xm5, [r2 + r3] ; row 1 of B, duplicated
+ vinserti128 m0, m0, xm4, 1 ; m0 = row 0: A in low 128-bit lane, B in high lane
+ vinserti128 m1, m1, xm5, 1 ; m1 = row 1: A low lane, B high lane
+ movddup xm4, [r2 + r3 * 2] ; row 2 of B, duplicated
+ movddup xm5, [r2 + r4] ; row 3 of B, duplicated
+ vinserti128 m2, m2, xm4, 1 ; m2 = row 2: A low lane, B high lane
+ vinserti128 m3, m3, xm5, 1 ; m3 = row 3: A low lane, B high lane
+
+ mova m4, [hmul_8w] ; word multipliers defined outside this hunk - presumably +1/+1 and +1/-1 pairs giving pair sums and pair differences (TODO confirm)
+ pmaddwd m0, m4 ; each dword = sum of two adjacent word products of row 0
+ pmaddwd m1, m4 ; same for row 1
+ pmaddwd m2, m4 ; same for row 2
+ pmaddwd m3, m4 ; same for row 3
+ paddd m5, m0, m1 ; m5 = row 0 + row 1 (dword-wise)
+ paddd m4, m2, m3 ; m4 = row 2 + row 3 (multiplier constant no longer needed)
+ paddd m5, m4 ; m5 = sum of all four rows
+ psrldq m4, m5, 4 ; shift each lane right by one dword
+ paddd m5, m4 ; dword 0 of each lane = dword 0 + dword 1 of the row total
+ psrld m5, 2 ; divide by 4; only dword 0 of each lane is consumed later
+
+ mova m4, m0 ; keep old row 0 for the difference
+ paddd m0, m1 ; m0 = row 0 + row 1
+ psubd m1, m4 ; m1 = row 1 - row 0
+ mova m4, m2 ; keep old row 2
+ paddd m2, m3 ; m2 = row 2 + row 3
+ psubd m3, m4 ; m3 = row 3 - row 2
+ mova m4, m0 ; keep old m0 (row 0 + row 1)
+ paddd m0, m2 ; m0 = (row0+row1) + (row2+row3)
+ psubd m2, m4 ; m2 = (row2+row3) - (row0+row1)
+ mova m4, m1 ; keep old m1 (row 1 - row 0)
+ paddd m1, m3 ; m1 = (row1-row0) + (row3-row2)
+ psubd m3, m4 ; m3 = (row3-row2) - (row1-row0)
+ movaps m4, m0 ; copy m0 so both shuffles below see the original value
+ vshufps m4, m4, m2, 11011101b ; m4 = odd dwords (1,3) of m0 then of m2, per lane
+ vshufps m0, m0, m2, 10001000b ; m0 = even dwords (0,2) of m0 then of m2, per lane
+ movaps m2, m1 ; copy m1 for the second shuffle pair
+ vshufps m2, m2, m3, 11011101b ; m2 = odd dwords of m1 then of m3, per lane
+ vshufps m1, m1, m3, 10001000b ; m1 = even dwords of m1 then of m3, per lane
+ pabsd m0, m0 ; |even dwords|
+ pabsd m4, m4 ; |odd dwords|
+ pmaxsd m0, m4 ; max(|s|,|t|) == (|s+t| + |s-t|) / 2, presumably standing in for a last butterfly stage plus halving
+ pabsd m1, m1 ; same treatment for the second register pair
+ pabsd m2, m2
+ pmaxsd m1, m2
+ paddd m0, m1 ; combine both sets of maxima
+
+ vpermq m1, m0, 11110101b ; copy the high qword of each lane over its low qword
+ paddd m0, m1 ; low qword of each lane = low qword + high qword
+ psrldq m1, m0, 4 ; bring dword 1 down to dword 0 in each lane
+ paddd m0, m1 ; dword 0 of each lane = sum of all four dwords of that lane
+ psubd m0, m5 ; subtract the (sum >> 2) term computed earlier
+
+ vextracti128 xm1, m0, 1 ; xm1 = high lane (block B result)
+ psubd xm1, xm0 ; B result - A result
+ pabsd xm1, xm1 ; absolute difference
+ movd eax, xm1 ; return dword 0
+ RET
+%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_4x4, 4, 5, 6
lea r4, [3 * r1]
movd xm0, [r0]
@@ -8314,6 +8384,7 @@
pabsd m1, m1
movd eax, xm1
RET
+%endif
%macro PSY_PP_8x8 0
movddup m0, [r0 + r1 * 0]
More information about the x265-devel
mailing list