[x265] [PATCH 1 of 3] asm: AVX2 code for pixel_var primitive, more than 40% improvement over SSE
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Sep 10 14:03:53 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1441715051 -19800
# Tue Sep 08 17:54:11 2015 +0530
# Node ID 89c234e68523b05550b8c5197b83849544dc97d1
# Parent 365f7ed4d89628d49cd6af8d81d4edc01f73ffad
asm: AVX2 code for pixel_var primitive, more than 40% improvement over SSE
diff -r 365f7ed4d896 -r 89c234e68523 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Sep 08 17:54:11 2015 +0530
@@ -2729,6 +2729,10 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2);
+ p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2);
+ p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2);
+
p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
p.planecopy_sp = PFX(downShift_16_avx2);
diff -r 365f7ed4d896 -r 89c234e68523 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Sep 08 17:54:11 2015 +0530
@@ -6397,6 +6397,78 @@
movd edx, xm6
%endif
RET
+
+INIT_YMM avx2 ; AVX2 pixel_var for a 32x32 block: reads 32 rows of 32 bytes starting at r0, rows r1 bytes apart
+cglobal pixel_var_32x32, 2,4,7 ; NOTE(review): cglobal is defined outside this hunk; 2,4,7 are presumably arg count, GPR count, vector-register count - confirm
+ VAR_START 0 ; macro defined outside this hunk; presumably clears the accumulators m5/m6 - confirm
+ mov r2d, 16 ; r2d = loop counter: 16 iterations, two rows per iteration
+
+.loop:
+ pmovzxbw m0, [r0] ; current row, bytes 0-15, zero-extended to 16 words
+ pmovzxbw m3, [r0 + 16] ; current row, bytes 16-31
+ pmovzxbw m1, [r0 + r1] ; following row (r0 + stride), bytes 0-15
+ pmovzxbw m4, [r0 + r1 + 16] ; following row, bytes 16-31
+
+ lea r0, [r0 + r1 * 2] ; step the source pointer down two rows
+
+ VAR_CORE ; macro defined elsewhere; NOTE(review): presumably adds m0/m1/m3/m4 to word sums in m5 and their squares to dword sums in m6 - confirm
+
+ dec r2d
+ jg .loop ; keep going while the counter is still positive
+
+ vextracti128 xm0, m5, 1 ; xm0 = upper 128 bits of m5
+ vextracti128 xm1, m6, 1 ; xm1 = upper 128 bits of m6
+ paddw xm5, xm0 ; fold m5's two halves together, word lanes
+ paddd xm6, xm1 ; fold m6's two halves together, dword lanes
+ HADDW xm5, xm2 ; macro defined elsewhere; presumably horizontal add of the words in xm5 (xm2 as scratch) - confirm
+ HADDD xm6, xm1 ; macro defined elsewhere; presumably horizontal add of the dwords in xm6 (xm1 as scratch) - confirm
+
+%if ARCH_X86_64
+ punpckldq xm5, xm6 ; low qword of xm5 = { xm5 dword 0, xm6 dword 0 }
+ movq rax, xm5 ; return: rax bits 0-31 from xm5, bits 32-63 from xm6
+%else
+ movd eax, xm5 ; return: eax = low dword of xm5
+ movd edx, xm6 ; return: edx = low dword of xm6
+%endif
+ RET
+
+INIT_YMM avx2 ; AVX2 pixel_var for a 64x64 block: reads 64 rows starting at r0, rows r1 bytes apart
+cglobal pixel_var_64x64, 2,4,7 ; NOTE(review): cglobal is defined outside this hunk; 2,4,7 are presumably arg count, GPR count, vector-register count - confirm
+ VAR_START 0 ; macro defined outside this hunk; presumably clears the accumulators m5/m6 - confirm
+ mov r2d, 64 ; r2d = loop counter: 64 iterations, one row per iteration
+
+.loop:
+ pmovzxbw m0, [r0] ; row bytes 0-15, zero-extended to 16 words
+ pmovzxbw m3, [r0 + 16] ; row bytes 16-31
+ pmovzxbw m1, [r0 + mmsize] ; 16 bytes at offset mmsize; NOTE(review): mmsize is presumably 32 under INIT_YMM, i.e. bytes 32-47 - confirm
+ pmovzxbw m4, [r0 + mmsize + 16] ; 16 bytes at offset mmsize + 16 (bytes 48-63 if mmsize is 32)
+
+ lea r0, [r0 + r1] ; step the source pointer down one row
+
+ VAR_CORE ; macro defined elsewhere; NOTE(review): presumably adds m0/m1/m3/m4 to word sums in m5 and their squares to dword sums in m6 - confirm
+
+ dec r2d
+ jg .loop ; keep going while the counter is still positive
+
+ pxor m1, m1 ; m1 = 0, used as the high half when widening words to dwords
+ punpcklwd m0, m5, m1 ; m0 = low four words of each 128-bit lane of m5, zero-extended to dwords
+ punpckhwd m5, m1 ; m5 = high four words of each 128-bit lane, zero-extended to dwords
+ paddd m5, m0 ; m5 = dword partial sums; NOTE(review): widened before reducing, presumably to avoid 16-bit overflow - confirm
+ vextracti128 xm2, m5, 1 ; xm2 = upper 128 bits of m5
+ vextracti128 xm1, m6, 1 ; xm1 = upper 128 bits of m6
+ paddd xm5, xm2 ; fold m5's two halves together, dword lanes
+ paddd xm6, xm1 ; fold m6's two halves together, dword lanes
+ HADDD xm5, xm2 ; macro defined elsewhere; presumably horizontal add of the dwords in xm5 (xm2 as scratch) - confirm
+ HADDD xm6, xm1 ; macro defined elsewhere; presumably horizontal add of the dwords in xm6 (xm1 as scratch) - confirm
+
+%if ARCH_X86_64
+ punpckldq xm5, xm6 ; low qword of xm5 = { xm5 dword 0, xm6 dword 0 }
+ movq rax, xm5 ; return: rax bits 0-31 from xm5, bits 32-63 from xm6
+%else
+ movd eax, xm5 ; return: eax = low dword of xm5
+ movd edx, xm6 ; return: edx = low dword of xm6
+%endif
+ RET
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 3
More information about the x265-devel
mailing list