[x265] [PATCH 2 of 3] asm: AVX2 version of sa8d[32x32]

Min Chen chenm003 at 163.com
Tue Apr 12 19:31:01 CEST 2016


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1460482248 18000
# Node ID 37e80d50caf51a74e85c83f24317935171a5d375
# Parent  40afead3177d7c128066334bfe075042388e86b0
asm: AVX2 version of sa8d[32x32]
Benchmark (speedup over C, optimized cycles, C reference cycles):

AVX:
  sa8d[32x32]  5.47x    7403.68         40490.18

AVX2:
  sa8d[32x32]  10.57x   3783.80         40001.89
---
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/pixel-a.asm        |  369 ++++++++++++++++++++++++++++++++++
 2 files changed, 370 insertions(+), 0 deletions(-)
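
A note for reviewers on the structure: the C reference rounds each 16x16 partial result
before accumulating it into the 32x32 cost, which is why the kernel below keeps separate
per-quadrant accumulators (m10/m12/m13) rather than one running sum, and why the TODO in
the code asks for two more YMM registers. The following is only a minimal C++ sketch of
that reference structure, not the actual x265 primitive: the helper name sa8d8x8_raw and
the scalar Hadamard are illustrative, and it assumes the usual x264/x265 convention of
summing four raw 8x8 Hadamard costs and rounding once per 16x16 block as (sum + 2) >> 2.

    // Reference-structure sketch (not the actual x265 code): each 16x16 block is
    // rounded on its own, so a 32x32 SIMD kernel must keep per-quadrant partial
    // sums rather than a single accumulator.
    #include <cstdint>
    #include <cstdlib>

    typedef uint16_t pixel;   // 10-bit (HIGH_BIT_DEPTH) build, as in this asm branch

    // Raw (unrounded) sum of absolute 8x8 Hadamard-transformed differences.
    // The helper name is illustrative, not an x265 identifier.
    static int sa8d8x8_raw(const pixel* pix1, intptr_t stride1,
                           const pixel* pix2, intptr_t stride2)
    {
        int d[8][8];
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                d[i][j] = pix1[i * stride1 + j] - pix2[i * stride2 + j];

        // in-place 8-point Walsh-Hadamard butterfly over elements v[0], v[step], ...
        auto wht8 = [](int* v, int step) {
            for (int s = 1; s < 8; s *= 2)
                for (int i = 0; i < 8; i += 2 * s)
                    for (int j = i; j < i + s; j++) {
                        int a = v[j * step], b = v[(j + s) * step];
                        v[j * step]       = a + b;
                        v[(j + s) * step] = a - b;
                    }
        };

        for (int i = 0; i < 8; i++) wht8(&d[i][0], 1);   // transform rows
        for (int j = 0; j < 8; j++) wht8(&d[0][0] + j, 8); // transform columns

        int sum = 0;
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                sum += std::abs(d[i][j]);
        return sum;
    }

    static int sa8d_16x16(const pixel* pix1, intptr_t stride1,
                          const pixel* pix2, intptr_t stride2)
    {
        int sum = sa8d8x8_raw(pix1,                   stride1, pix2,                   stride2)
                + sa8d8x8_raw(pix1 + 8,               stride1, pix2 + 8,               stride2)
                + sa8d8x8_raw(pix1 + 8 * stride1,     stride1, pix2 + 8 * stride2,     stride2)
                + sa8d8x8_raw(pix1 + 8 * stride1 + 8, stride1, pix2 + 8 * stride2 + 8, stride2);
        return (sum + 2) >> 2;   // one rounding per 16x16 partial result
    }

    int sa8d_32x32(const pixel* pix1, intptr_t stride1,
                   const pixel* pix2, intptr_t stride2)
    {
        int cost = 0;
        for (int y = 0; y < 32; y += 16)      // four independent 16x16 quadrants
            for (int x = 0; x < 32; x += 16)
                cost += sa8d_16x16(pix1 + y * stride1 + x, stride1,
                                   pix2 + y * stride2 + x, stride2);
        return cost;
    }

In the assembly below, m10, m12 and m13 serve as those per-quadrant accumulators before
the final combine.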

diff -r 40afead3177d -r 37e80d50caf5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat Apr 09 19:32:28 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 12 12:30:48 2016 -0500
@@ -2161,6 +2161,7 @@
         p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
         p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
         p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
+        p.cu[LUMA_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
 #endif
 
         if (cpuMask & X265_CPU_BMI2)
diff -r 40afead3177d -r 37e80d50caf5 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Sat Apr 09 19:32:28 2016 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Apr 12 12:30:48 2016 -0500
@@ -13995,4 +13995,373 @@
     shr  eax, 1
     RET
 
+
+; TODO: optimize me, needs 2 more YMM registers because the C model produces a partial result for every 16x16 block
+INIT_YMM avx2
+cglobal pixel_sa8d_32x32, 4,8,14
+    FIX_STRIDES r1, r3
+    lea  r4, [3*r1]
+    lea  r5, [3*r3]
+    lea  r6, [r0+4*r1]
+    lea  r7, [r2+4*r3]
+    vbroadcasti128 m7, [pw_1]
+
+
+    ;SA8D[16x8] ; pix[0]
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m10, m0, m2
+
+
+    ; SA8D[16x8] ; pix[16]
+    add  r0, mmsize
+    add  r2, mmsize
+    add  r6, mmsize
+    add  r7, mmsize
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m12, m0, m2
+
+
+    ; SA8D[16x8] ; pix[8*stride+16]
+    lea  r0, [r0+8*r1]
+    lea  r2, [r2+8*r3]
+    lea  r6, [r6+8*r1]
+    lea  r7, [r7+8*r3]
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m12, m0
+    paddd m12, m2
+
+    ; sum[1]
+    HADDD m12, m0
+
+
+    ; SA8D[16x8] ; pix[8*stride]
+    sub  r0, mmsize
+    sub  r2, mmsize
+    sub  r6, mmsize
+    sub  r7, mmsize
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m10, m0
+    paddd m10, m2
+
+    ; sum[0]
+    HADDD m10, m0
+    punpckldq xm10, xm12
+
+
+    ;SA8D[16x8] ; pix[16*stride]
+    lea  r0, [r0+8*r1]
+    lea  r2, [r2+8*r3]
+    lea  r6, [r6+8*r1]
+    lea  r7, [r7+8*r3]
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m12, m0, m2
+
+
+    ; SA8D[16x8] ; pix[16*stride+16]
+    add  r0, mmsize
+    add  r2, mmsize
+    add  r6, mmsize
+    add  r7, mmsize
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m13, m0, m2
+
+
+    ; SA8D[16x8] ; pix[24*stride+16]
+    lea  r0, [r0+8*r1]
+    lea  r2, [r2+8*r3]
+    lea  r6, [r6+8*r1]
+    lea  r7, [r7+8*r3]
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m13, m0
+    paddd m13, m2
+
+    ; sum[3]
+    HADDD m13, m0
+
+
+    ; SA8D[16x8] ; pix[24*stride]
+    sub  r0, mmsize
+    sub  r2, mmsize
+    sub  r6, mmsize
+    sub  r7, mmsize
+
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+    movu m0, [r0]
+    movu m5, [r2]
+    psubw m0, m5
+    movu m1, [r0 + r1]
+    movu m6, [r2 + r3]
+    psubw m1, m6
+    movu m2, [r0 + r1 * 2]
+    movu m5, [r2 + r3 * 2]
+    psubw m2, m5
+    movu m8, [r0 + r4]
+    movu m6, [r2 + r5]
+    psubw m8, m6
+
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+    movu m4, [r6]
+    movu m11, [r7]
+    psubw m4, m11
+    movu m5, [r6 + r1]
+    movu m6, [r7 + r3]
+    psubw m5, m6
+    movu m3, [r6 + r1 * 2]
+    movu m11, [r7 + r3 * 2]
+    psubw m3, m11
+    movu m9, [r6 + r4]
+    movu m6, [r7 + r5]
+    psubw m9, m6
+
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+    paddw m0, m1
+    paddw m2, m8
+    pmaddwd m0, m7
+    pmaddwd m2, m7
+    paddd m12, m0
+    paddd m12, m2
+
+    ; sum[2]
+    HADDD m12, m0
+    punpckldq xm12, xm13
+
+    ; SA8D
+    punpcklqdq xm0, xm10, xm12
+    paddd xm0, [pd_1]
+    psrld xm0, 1
+    HADDD xm0, xm1
+
+    movd eax, xm0
+    RET
+
 %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10


