[x265] [PATCH 290 of 307] x86: AVX512 pixel_satd_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:48 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515745345 -19800
#      Fri Jan 12 13:52:25 2018 +0530
# Node ID 1c2875198a213a5f8d84bff57fcec15727f94a4f
# Parent  d43237051962eab3cd761cf24f3971de09c07aa5
x86: AVX512 pixel_satd_32xN

Size   | AVX2 performance | AVX512 performance
-----------------------------------------------
32x8   |    10.34x         |    12.26x
32x16  |    10.21x         |    12.40x
32x24  |    10.47x         |    13.23x
32x32  |    10.55x         |    12.46x
32x48  |    10.60x         |    12.59x
32x64  |    10.56x         |    12.65x

diff -r d43237051962 -r 1c2875198a21 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jan 11 14:31:13 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jan 12 13:52:25 2018 +0530
@@ -5342,6 +5342,20 @@
         p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
         p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
         p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+        p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
+        p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
 
     }
 #endif
diff -r d43237051962 -r 1c2875198a21 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Jan 11 14:31:13 2018 +0530
+++ b/source/common/x86/pixel-a.asm	Fri Jan 12 13:52:25 2018 +0530
@@ -14068,8 +14068,102 @@
     paddd           xm0, xm1
     movd            eax, xm0
     RET
+
+%macro PROCESS_SATD_32x4_AVX512 0        ; macro (expanded inline, not a callable function): adds the SATD cost of one 32-column x 4-row block into the dword lanes of m6; uses m0-m5 and m7 as scratch
+    ; rows 0-3: load 32 bytes per row from each input ([r0...] and [r2...]), zero-extend to words, subtract
+    pmovzxbw         m0, [r0]
+    pmovzxbw         m4, [r2]
+    psubw           m0, m4                ; m0 = row 0 difference (16-bit words)
+    pmovzxbw         m1, [r0 + r1]
+    pmovzxbw         m5, [r2 + r3]
+    psubw           m1, m5                ; m1 = row 1 difference
+    pmovzxbw         m2, [r0 + r1 * 2]
+    pmovzxbw         m4, [r2 + r3 * 2]
+    psubw           m2, m4                ; m2 = row 2 difference
+    pmovzxbw         m3, [r0 + r4]        ; r4 = 3 * r1, set up by SATD_32xN_AVX512 before expansion
+    pmovzxbw         m5, [r2 + r5]        ; r5 = 3 * r3, set up by SATD_32xN_AVX512 before expansion
+    psubw           m3, m5                ; m3 = row 3 difference
+    paddw           m4, m0, m1            ; butterfly stage 1 across rows: m4 = d0 + d1
+    psubw           m1, m0                ; m1 = d1 - d0
+    paddw           m0, m2, m3            ; m0 = d2 + d3
+    psubw           m3, m2                ; m3 = d3 - d2
+    punpckhwd       m2, m4, m1            ; interleave sum/difference words so the next add/sub pairs them
+    punpcklwd       m4, m1
+    punpckhwd       m1, m0, m3
+    punpcklwd       m0, m3
+    paddw           m3, m4, m0            ; butterfly stage 2
+    psubw           m0, m4
+    paddw           m4, m2, m1
+    psubw           m1, m2
+    punpckhdq       m2, m3, m0            ; interleave at dword granularity for the next stage
+    punpckldq       m3, m0
+    paddw           m0, m3, m2            ; butterfly stage 3
+    psubw           m2, m3
+    punpckhdq       m3, m4, m1
+    punpckldq       m4, m1
+    paddw           m1, m4, m3
+    psubw           m3, m4
+    punpckhqdq      m4, m0, m1            ; interleave at qword granularity to line up the last pair of operands
+    punpcklqdq      m0, m1
+    pabsw           m0, m0
+    pabsw           m4, m4
+    pmaxsw          m0, m0, m4            ; NOTE(review): max(|a|, |b|) appears to stand in for a final add/sub stage, since |a+b| + |a-b| = 2 * max(|a|, |b|) - confirm
+    punpckhqdq      m1, m2, m3
+    punpcklqdq      m2, m3
+    pabsw           m2, m2
+    pabsw           m1, m1
+    pmaxsw          m2, m1                ; same max-of-absolute-values step for the second half of the coefficients
+    pxor            m7, m7                ; m7 = 0, used to zero-extend words to dwords via unpack
+    mova            m1, m0
+    punpcklwd       m1, m7                ; low words of each 128-bit lane of m0 -> dwords
+    paddd           m6, m1                ; accumulate in 32-bit lanes of m6
+    mova            m1, m0
+    punpckhwd       m1, m7                ; high words of each 128-bit lane of m0 -> dwords
+    paddd           m6, m1
+    pxor            m7, m7                ; NOTE(review): redundant - m7 is not written since the pxor above and is still zero
+    mova            m1, m2
+    punpcklwd       m1, m7                ; low words of each 128-bit lane of m2 -> dwords
+    paddd           m6, m1
+    mova            m1, m2
+    punpckhwd       m1, m7                ; high words of each 128-bit lane of m2 -> dwords
+    paddd           m6, m1
+%endmacro
+
+%macro SATD_MAIN_AVX512_END 0            ; horizontally sums the 16 dword lanes of m6 and leaves the total in eax; clobbers m6, m7, rax, rdx
+    vextracti32x8   ym7,   m6,   1        ; ym7 = upper 256 bits of m6
+    paddd           ym6,   ym7            ; 16 partial sums -> 8
+    vextracti128    xm7,   ym6,  1        ; xm7 = upper 128 bits of ym6
+    paddd           xm6,   xm6,  xm7      ; 8 partial sums -> 4
+    punpckhqdq      xm7,   xm6,  xm6      ; bring the upper qword of xm6 down to the low qword of xm7
+    paddd           xm6,   xm7            ; 4 partial sums -> 2 (in the low qword)
+    movq            rax,   xm6            ; rax = the two remaining dword sums
+    rorx            rdx,   rax,  32       ; rdx low dword = upper dword of rax (rorx is a BMI2 instruction)
+    add             eax,   edx            ; eax = final total
+%endmacro
+
+%macro SATD_32xN_AVX512 1                ; emits pixel_satd_32x%1 (AVX512 variant); %1 = block height in rows, consumed 4 rows at a time
+INIT_ZMM avx512
+cglobal pixel_satd_32x%1, 4,6,8          ; x86inc cglobal: 4 arguments (r0-r3), 6 GPRs, 8 vector registers
+    lea             r4, [3 * r1]          ; r4 = 3 * r1, offset of row 3 relative to r0
+    lea             r5, [3 * r3]          ; r5 = 3 * r3, offset of row 3 relative to r2
+    pxor            m6, m6                ; m6 = running total in dword lanes, cleared before accumulation
+%rep %1/4 - 1                            ; fully unrolled: every 4-row group except the last, each followed by a pointer advance
+    PROCESS_SATD_32x4_AVX512
+    lea             r0, [r0 + 4 * r1]     ; advance first input by 4 rows
+    lea             r2, [r2 + 4 * r3]     ; advance second input by 4 rows
+%endrep
+    PROCESS_SATD_32x4_AVX512              ; last 4-row group; no pointer advance needed afterwards
+    SATD_MAIN_AVX512_END                  ; reduce m6 to a scalar in eax
+    RET
+%endmacro
+
+SATD_32xN_AVX512 8                       ; instantiate pixel_satd_32x8 .. pixel_satd_32x64, one function per supported height
+SATD_32xN_AVX512 16
+SATD_32xN_AVX512 24
+SATD_32xN_AVX512 32
+SATD_32xN_AVX512 48
+SATD_32xN_AVX512 64
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
-
 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
 INIT_YMM avx2
 cglobal calc_satd_16x8    ; function to compute satd cost for 16 columns, 8 rows


More information about the x265-devel mailing list