[x265] [PATCH 1 of 2 updated] asm: new AVX2 version of satd_8x8 (509c -> 307c)

Min Chen chenm003 at 163.com
Thu Mar 31 00:59:15 CEST 2016


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1459377415 18000
# Node ID ed0ed12c8359f77acdd85f7443bd4cd7bc1ba16e
# Parent  5dbd6a0c8e17481a0c4d31243ebc8b46ad59e15d
asm: new AVX2 version of satd_8x8 (509c -> 307c)
---
 source/common/x86/asm-primitives.cpp |    4 ++
 source/common/x86/pixel-a.asm        |   74 ++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 0 deletions(-)

diff -r 5dbd6a0c8e17 -r ed0ed12c8359 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Mar 28 12:53:40 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Mar 30 17:36:55 2016 -0500
@@ -2157,6 +2157,10 @@
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
 
+#if X265_DEPTH == 10
+        p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
+#endif
+
         if (cpuMask & X265_CPU_BMI2)
         {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
diff -r 5dbd6a0c8e17 -r ed0ed12c8359 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Mar 28 12:53:40 2016 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Mar 30 17:36:55 2016 -0500
@@ -13799,3 +13799,77 @@
     movzx           eax, al
     RET
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+
+
+%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
+%macro LOAD_DIFF_AVX2 4             ; word-wise difference: %1 = dest register, %2 = temp register, %3 = first source operand, %4 = second source operand
+    movu       %1, %3               ; unaligned load of the first operand into dest
+    movu       %2, %4               ; unaligned load of the second operand into temp (temp is clobbered)
+    psubw      %1, %2               ; dest = first - second, per 16-bit word
+%endmacro
+
+%macro LOAD_DIFF_8x4P_AVX2 7-9 r0,r2 ; loads four row differences: %1-%4 = dest register numbers, %5-%6 = temp register numbers, %7 = not referenced in the body, %8/%9 = row pointers (default r0/r2)
+    LOAD_DIFF_AVX2 xm%1, xm%5, [%8],      [%9]        ; row 0 -> xm%1
+    LOAD_DIFF_AVX2 xm%2, xm%6, [%8+r1],   [%9+r3]     ; row 1 -> xm%2 (r1/r3 are used as the row strides of %8/%9)
+    LOAD_DIFF_AVX2 xm%3, xm%5, [%8+2*r1], [%9+2*r3]   ; row 2 -> xm%3, reusing temp %5
+    LOAD_DIFF_AVX2 xm%4, xm%6, [%8+r4],   [%9+r5]     ; row 3 -> xm%4; presumably r4 = 3*r1 and r5 = 3*r3, prepared by the caller -- TODO confirm
+
+    lea %8, [%8+4*r1]                                  ; advance the first pointer by four rows
+    lea %9, [%9+4*r3]                                  ; advance the second pointer by four rows
+%endmacro                                              ; NOTE(review): no invocation of this macro is visible in this hunk -- confirm it is needed
+
+%macro SATD_8x4_AVX2 8-9                       ; uses %2-%6 as register numbers for the transform and %8 as the accumulator register number; %1 and %7 are not referenced in the body
+    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax  ; macro defined outside this hunk; presumably a 2D 4x4 Hadamard transform leaving absolute-max results in m%2 and m%4 -- TODO confirm
+    paddw m%8, m%2                             ; accumulate m%2 into m%8 as 16-bit words
+    paddw m%8, m%4                             ; accumulate m%4 into m%8 as 16-bit words
+%endmacro                                      ; NOTE(review): no invocation of this macro is visible in this hunk (pixel_satd_8x8 below calls SATD_8x4_SSE) -- confirm it is needed
+
+INIT_YMM avx2                               ; x86inc: m# names map to ymm registers, function gets the _avx2 suffix
+cglobal pixel_satd_8x8, 4,4,7               ; x86inc: 4 arguments, 4 GPRs, 7 vector registers (m0-m6)
+                                            ; r0/r2 = row pointers of the two blocks, r1/r3 = their strides (as used by the addressing below); result returned in eax
+    FIX_STRIDES r1, r3                      ; macro defined outside this hunk; presumably converts pixel strides to byte strides for 16-bit samples -- TODO confirm
+    pxor    xm6, xm6                        ; clear m6, which is read back below as the sum register
+
+    ; load_diff 0 & 4: m0 = row 0 difference in the low 128-bit lane, row 4 difference in the high lane
+    movu    xm0, [r0]                       ; row 0 of the first block
+    movu    xm1, [r2]                       ; row 0 of the second block
+    vinserti128 m0, m0, [r0 + r1 * 4], 1    ; row 4 of the first block into the high lane
+    vinserti128 m1, m1, [r2 + r3 * 4], 1    ; row 4 of the second block into the high lane
+    psubw   m0, m1                          ; per-word difference for both rows at once
+    add     r0, r1                          ; step the first pointer to the next row
+    add     r2, r3                          ; step the second pointer to the next row
+
+    ; load_diff 1 & 5: same pattern, result in m1 (m2 is the temp)
+    movu    xm1, [r0]
+    movu    xm2, [r2]
+    vinserti128 m1, m1, [r0 + r1 * 4], 1
+    vinserti128 m2, m2, [r2 + r3 * 4], 1
+    psubw   m1, m2
+    add     r0, r1
+    add     r2, r3
+
+    ; load_diff 2 & 6: same pattern, result in m2 (m3 is the temp)
+    movu    xm2, [r0]
+    movu    xm3, [r2]
+    vinserti128 m2, m2, [r0 + r1 * 4], 1
+    vinserti128 m3, m3, [r2 + r3 * 4], 1
+    psubw   m2, m3
+    add     r0, r1
+    add     r2, r3
+
+    ; load_diff 3 & 7: same pattern, result in m3 (m4 is the temp); pointers are not advanced again
+    movu    xm3, [r0]
+    movu    xm4, [r2]
+    vinserti128 m3, m3, [r0 + r1 * 4], 1
+    vinserti128 m4, m4, [r2 + r3 * 4], 1
+    psubw   m3, m4
+
+    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6   ; macro defined outside this hunk; presumably transforms m0-m3 using m4/m5 as temps and accumulates word sums into m6 -- TODO confirm
+
+    vextracti128 xm0, m6, 1                 ; xm0 = high 128-bit lane of the sums
+    paddw xm6, xm0                          ; fold the high-lane word sums into the low lane
+    HADDUW xm6, xm0                         ; macro defined outside this hunk; presumably a horizontal add of unsigned words with xm0 as scratch -- TODO confirm
+    movd   eax, xm6                         ; return the low 32 bits of xm6
+    RET                                     ; x86inc return macro
+
+
+%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10



More information about the x265-devel mailing list