[x265] [PATCH 1 of 2 updated] asm: new AVX2 version of satd_8x8 (509c -> 307c)
Min Chen
chenm003 at 163.com
Thu Mar 31 00:59:15 CEST 2016
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1459377415 18000
# Node ID ed0ed12c8359f77acdd85f7443bd4cd7bc1ba16e
# Parent 5dbd6a0c8e17481a0c4d31243ebc8b46ad59e15d
asm: new AVX2 version of satd_8x8 (509c -> 307c)
---
source/common/x86/asm-primitives.cpp | 4 ++
source/common/x86/pixel-a.asm | 74 ++++++++++++++++++++++++++++++++++
2 files changed, 78 insertions(+), 0 deletions(-)
diff -r 5dbd6a0c8e17 -r ed0ed12c8359 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Mar 28 12:53:40 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Mar 30 17:36:55 2016 -0500
@@ -2157,6 +2157,10 @@
ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes
p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4]
+#if X265_DEPTH == 10
+ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
+#endif
+
if (cpuMask & X265_CPU_BMI2)
{
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
diff -r 5dbd6a0c8e17 -r ed0ed12c8359 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Mar 28 12:53:40 2016 +0530
+++ b/source/common/x86/pixel-a.asm Wed Mar 30 17:36:55 2016 -0500
@@ -13799,3 +13799,77 @@
movzx eax, al
RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+
+
+%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
+%macro LOAD_DIFF_AVX2 4
+ movu %1, %3
+ movu %2, %4
+ psubw %1, %2
+%endmacro
+
+%macro LOAD_DIFF_8x4P_AVX2 7-9 r0,r2 ; 4x dest, 2x temp, 2x pointer
+ LOAD_DIFF_AVX2 xm%1, xm%5, [%8], [%9]
+ LOAD_DIFF_AVX2 xm%2, xm%6, [%8+r1], [%9+r3]
+ LOAD_DIFF_AVX2 xm%3, xm%5, [%8+2*r1], [%9+2*r3]
+ LOAD_DIFF_AVX2 xm%4, xm%6, [%8+r4], [%9+r5]
+
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endmacro
+
+%macro SATD_8x4_AVX2 8-9
+ HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
+ paddw m%8, m%2
+ paddw m%8, m%4
+%endmacro
+
+INIT_YMM avx2
+cglobal pixel_satd_8x8, 4,4,7
+
+ FIX_STRIDES r1, r3
+ pxor xm6, xm6
+
+ ; load_diff 0 & 4
+ movu xm0, [r0]
+ movu xm1, [r2]
+ vinserti128 m0, m0, [r0 + r1 * 4], 1
+ vinserti128 m1, m1, [r2 + r3 * 4], 1
+ psubw m0, m1
+ add r0, r1
+ add r2, r3
+
+ ; load_diff 1 & 5
+ movu xm1, [r0]
+ movu xm2, [r2]
+ vinserti128 m1, m1, [r0 + r1 * 4], 1
+ vinserti128 m2, m2, [r2 + r3 * 4], 1
+ psubw m1, m2
+ add r0, r1
+ add r2, r3
+
+ ; load_diff 2 & 6
+ movu xm2, [r0]
+ movu xm3, [r2]
+ vinserti128 m2, m2, [r0 + r1 * 4], 1
+ vinserti128 m3, m3, [r2 + r3 * 4], 1
+ psubw m2, m3
+ add r0, r1
+ add r2, r3
+
+ ; load_diff 3 & 7
+ movu xm3, [r0]
+ movu xm4, [r2]
+ vinserti128 m3, m3, [r0 + r1 * 4], 1
+ vinserti128 m4, m4, [r2 + r3 * 4], 1
+ psubw m3, m4
+
+ SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
+
+ vextracti128 xm0, m6, 1
+ paddw xm6, xm0
+ HADDUW xm6, xm0
+ movd eax, xm6
+ RET
+
+%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
More information about the x265-devel mailing list