[x265] [PATCH] asm: psyCost_pp avx2 code for BLOCK_4x4
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Mar 24 07:24:54 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427122202 -19800
# Mon Mar 23 20:20:02 2015 +0530
# Node ID 32c2596f4716520fd52d1685b5e2be176b5bc08c
# Parent c6d268fedc36710dc77301c45816246a09f10ad7
asm: psyCost_pp avx2 code for BLOCK_4x4
AVX2:
psy_cost_pp[4x4] 10.30x 216.56 2230.77
SSE4:
psy_cost_pp[4x4] 6.53x 352.01 2297.35
diff -r c6d268fedc36 -r 32c2596f4716 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Mar 23 14:10:52 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 23 20:20:02 2015 +0530
@@ -1417,6 +1417,7 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_avx2;
p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_avx2;
p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_avx2;
diff -r c6d268fedc36 -r 32c2596f4716 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Mar 23 14:10:52 2015 +0530
+++ b/source/common/x86/pixel-a.asm Mon Mar 23 20:20:02 2015 +0530
@@ -38,7 +38,7 @@
times 4 db 1, -1
times 8 db 1
times 4 db 1, -1
-hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+hmul_4p: times 4 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
hmul_8w: times 4 dw 1
@@ -8146,6 +8146,62 @@
%endif ; HIGH_BIT_DEPTH
%endif
+INIT_YMM avx2                          ; x86inc: mN = ymmN, xmN = xmmN; symbol gets the _avx2 suffix
+cglobal psyCost_pp_4x4, 4, 5, 6        ; int psyCost_pp_4x4(source, sstride, recon, rstride): r0..r3 = args, r4 = scratch, m0-m5 used
+ lea r4, [3 * r1]                      ; r4 = 3 * sstride (offset of source row 3)
+ movd xm0, [r0]                        ; 4 bytes of source row 0 -- NOTE(review): one full row only if pixel is 8-bit; confirm HIGH_BIT_DEPTH guard
+ movd xm1, [r0 + r1]                   ; 4 bytes of source row 1
+ movd xm2, [r0 + r1 * 2]               ; 4 bytes of source row 2
+ movd xm3, [r0 + r4]                   ; 4 bytes of source row 3
+ vshufps xm0, xm1, 0                   ; xm0 dwords = {row0, row0, row1, row1}
+ vshufps xm2, xm3, 0                   ; xm2 dwords = {row2, row2, row3, row3}
+
+ lea r4, [3 * r3]                      ; r4 = 3 * rstride (offset of recon row 3)
+ movd xm1, [r2]                        ; 4 bytes of recon row 0
+ movd xm3, [r2 + r3]                   ; 4 bytes of recon row 1
+ movd xm4, [r2 + r3 * 2]               ; 4 bytes of recon row 2
+ movd xm5, [r2 + r4]                   ; 4 bytes of recon row 3
+ vshufps xm1, xm3, 0                   ; xm1 dwords = {row0, row0, row1, row1} of recon
+ vshufps xm4, xm5, 0                   ; xm4 dwords = {row2, row2, row3, row3} of recon
+
+ vinserti128 m0, m0, xm1, 1            ; m0: low lane = source rows 0/1, high lane = recon rows 0/1
+ vinserti128 m2, m2, xm4, 1            ; m2: low lane = source rows 2/3, high lane = recon rows 2/3
+
+ mova m4, [hmul_4p]                    ; 32-byte load of {1,1,1,1,1,-1,1,-1} x4 -- NOTE(review): mova needs hmul_4p 32-byte aligned; confirm
+ pmaddubsw m0, m4                      ; per row (one qword): words {p0+p1, p2+p3, p0-p1, p2-p3}; pixels unsigned, coefficients signed
+ pmaddubsw m2, m4                      ; same horizontal stage for rows 2/3
+
+ paddw m5, m0, m2                      ; m5 = rows 0/1 + rows 2/3 (qword0 = row0+row2, qword1 = row1+row3)
+ mova m1, m5                           ; keep the row sums for the vertical butterfly below
+ psrldq m4, m5, 8                      ; per lane: bring qword1 down to qword0
+ paddw m5, m4                          ; qword0 = sum over all four rows
+ pmaddwd m5, [pw_1]                    ; dword0 = word0 + word1 = sum of the 16 bytes per lane (assumes pw_1 is all word 1s -- defined outside this hunk)
+ psrld m5, 2                           ; dword0 = (pixel sum) >> 2; other dwords are never read
+
+ vpsubw m2, m2, m0                     ; m2 = rows 2/3 - rows 0/1 (qword0 = row2-row0, qword1 = row3-row1)
+ vpunpckhqdq m0, m1, m2                ; m0 = {row1+row3, row3-row1} per lane
+ vpunpcklqdq m1, m1, m2                ; m1 = {row0+row2, row2-row0} per lane
+ vpaddw m2, m1, m0                     ; second vertical butterfly stage: sums
+ vpsubw m0, m0, m1                     ; second vertical butterfly stage: differences
+ vpblendw m1, m2, m0, 10101010b        ; m1 = even words from m2, odd words from m0
+ vpslld m0, m0, 10h                    ; move m0's even words into the odd word slots
+ vpsrld m2, m2, 10h                    ; move m2's odd words into the even word slots
+ vpor m0, m0, m2                       ; m0 = {m2 odd words, m0 even words}: the partners of the words held in m1
+ vpabsw m1, m1                         ; |a|
+ vpabsw m0, m0                         ; |b|
+ vpmaxsw m1, m1, m0                    ; max(|a|,|b|) per word pair -- presumably the (|a+b| + |a-b|) / 2 shortcut for the last butterfly stage; verify
+ vpmaddwd m1, m1, [pw_1]               ; add adjacent words into dwords (same pw_1 assumption as above)
+ psrldq m2, m1, 8                      ; horizontal add, step 1: upper qword onto lower qword
+ paddd m1, m2
+ psrldq m3, m1, 4                      ; horizontal add, step 2: dword1 onto dword0
+ paddd m1, m3                          ; dword0 = sum of the lane's four dwords
+ psubd m1, m5                          ; dword0 = transform sum - (pixel sum >> 2), per lane
+ vextracti128 xm2, m1, 1               ; xm2 = high lane = value computed from recon
+ psubd m1, m2                          ; dword0 = source value - recon value
+ pabsd m1, m1                          ; absolute difference
+ movd eax, xm1                         ; return dword0 in eax
+ RET                                   ; x86inc return macro
+
%macro PSY_PP_8x8 0
movddup m0, [r0 + r1 * 0]
movddup m1, [r0 + r1 * 1]
diff -r c6d268fedc36 -r 32c2596f4716 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Mar 23 14:10:52 2015 +0530
+++ b/source/common/x86/pixel.h Mon Mar 23 20:20:02 2015 +0530
@@ -260,6 +260,7 @@
void x265_pixel_sub_ps_32x32_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_64x64_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+int x265_psyCost_pp_4x4_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_pp_8x8_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_pp_16x16_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_pp_32x32_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
More information about the x265-devel
mailing list