[x265] [PATCH 250 of 307] x86: pixel_satd_16xN for high bit depth

mythreyi at multicorewareinc.com
Sat Apr 7 04:34:08 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512648175 -19800
#      Thu Dec 07 17:32:55 2017 +0530
# Node ID 86d3d34de566d7696028b5e798a79b9de3a6e62b
# Parent  617aa7cf2c76368cb8a3b252175c1b3d6f716915
x86: pixel_satd_16xN for high bit depth

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x8  |       9.62x       |      14.03x
16x16 |      12.07x       |      13.57x
16x32 |      12.82x       |      16.03x
16x64 |      12.92x       |      15.76x

This patch also cleans up the existing SATD AVX512 code.
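
For context on what these kernels compute: SATD is the sum of absolute
values of the Hadamard-transformed residual, built up from 4x4 transform
blocks over the whole partition. The new PROCESS_SATD_16x8_HBD_AVX512
macro covers eight rows per pass by loading rows 0-3 into the low 256
bits and rows 4-7 into the high 256 bits of each zmm register
(vinserti32x8). Below is a minimal scalar C++ sketch of the 4x4 building
block (illustration only, not part of the patch; the name satd_4x4_ref
and the final >> 1 normalization follow the usual x264/x265 convention):

#include <cstdint>
#include <cstdlib>

// Scalar reference for 4x4 SATD on 16-bit (high-bit-depth) pixels.
static int satd_4x4_ref(const uint16_t* pix1, intptr_t stride1,
                        const uint16_t* pix2, intptr_t stride2)
{
    int32_t tmp[4][4];
    int32_t sum = 0;

    // Horizontal 1-D Hadamard transform of each residual row.
    for (int i = 0; i < 4; i++, pix1 += stride1, pix2 += stride2)
    {
        int32_t a0 = pix1[0] - pix2[0];
        int32_t a1 = pix1[1] - pix2[1];
        int32_t a2 = pix1[2] - pix2[2];
        int32_t a3 = pix1[3] - pix2[3];
        int32_t b0 = a0 + a1, b1 = a0 - a1;
        int32_t b2 = a2 + a3, b3 = a2 - a3;
        tmp[i][0] = b0 + b2; tmp[i][1] = b1 + b3;
        tmp[i][2] = b0 - b2; tmp[i][3] = b1 - b3;
    }

    // Vertical 1-D Hadamard transform, accumulating absolute coefficients.
    for (int i = 0; i < 4; i++)
    {
        int32_t b0 = tmp[0][i] + tmp[1][i], b1 = tmp[0][i] - tmp[1][i];
        int32_t b2 = tmp[2][i] + tmp[3][i], b3 = tmp[2][i] - tmp[3][i];
        sum += std::abs(b0 + b2) + std::abs(b1 + b3)
             + std::abs(b0 - b2) + std::abs(b1 - b3);
    }

    return sum >> 1;   // x264/x265 convention: halve the Hadamard sum
}

A 16xN SATD is then the sum of satd_4x4_ref over all 4x4 sub-blocks; the
assembly below computes many of these transforms in parallel per pass.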

diff -r 617aa7cf2c76 -r 86d3d34de566 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 08 14:12:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 07 17:32:55 2017 +0530
@@ -3026,7 +3026,10 @@
         p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
         //Luma_hps_48x64
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
-
+        p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
+        p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
+        p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx512);
+        p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx512);
         p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
         p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
         p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
@@ -3036,11 +3039,17 @@
         p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
         p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
         p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
-
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
diff -r 617aa7cf2c76 -r 86d3d34de566 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Fri Dec 08 14:12:55 2017 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Dec 07 17:32:55 2017 +0530
@@ -8227,7 +8227,7 @@
     pmaxsw    m%1, m%3
     pmaxsw    m%2, m%4
 %endmacro
-
+%if HIGH_BIT_DEPTH==0
 INIT_ZMM avx512
 cglobal pixel_satd_16x8_internal
     vbroadcasti64x4 m6, [hmul_16p]
@@ -8381,7 +8381,7 @@
     SUMSUB_BA      w, 0, 1, 2
     HMAXABSW2      0, 1, 2, 3
     SATD_AVX512_END 1
-
+%endif
 ; Input 10bit, Output 8bit
 ;------------------------------------------------------------------------------------------------------------------------
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
@@ -13971,82 +13971,31 @@
     paddd           xm6, xm7
     movd            eax, xm6
 %endmacro
-
-%macro PROCESS_SATD_32x8_HBD_AVX512 0        ; function to compute satd cost for 32 columns, 8 rows
+%macro PROCESS_SATD_16x8_HBD_AVX512 0        ; function to compute satd cost for 16 columns, 8 rows
     ; rows 0-3
-    movu            m0, [r0]
-    movu            m4, [r2]
+    lea             r6, [r0 + r1 * 4]
+    lea             r7, [r2 + r3 * 4]
+    movu            ym0, [r0]
+    movu            ym4, [r2]
+    vinserti32x8    m0, [r6], 1
+    vinserti32x8    m4, [r7], 1
     psubw           m0, m4
-    movu            m1, [r0 + r1]
-    movu            m5, [r2 + r3]
+    movu            ym1, [r0 + r1]
+    movu            ym5, [r2 + r3]
+    vinserti32x8    m1, [r6 + r1], 1
+    vinserti32x8    m5, [r7 + r3], 1
     psubw           m1, m5
-    movu            m2, [r0 + r1 * 2]
-    movu            m4, [r2 + r3 * 2]
+    movu            ym2, [r0 + r1 * 2]
+    movu            ym4, [r2 + r3 * 2]
+    vinserti32x8    m2, [r6 + r1 * 2], 1
+    vinserti32x8    m4, [r7 + r3 * 2], 1
     psubw           m2, m4
-    movu            m3, [r0 + r4]
-    movu            m5, [r2 + r5]
+    movu            ym3, [r0 + r4]
+    movu            ym5, [r2 + r5]
+    vinserti32x8    m3, [r6 + r4], 1
+    vinserti32x8    m5, [r7 + r5], 1
     psubw           m3, m5
-    lea             r0, [r0 + r1 * 4]
-    lea             r2, [r2 + r3 * 4]
-    paddw           m4, m0, m1
-    psubw           m1, m0
-    paddw           m0, m2, m3
-    psubw           m3, m2
-    punpckhwd       m2, m4, m1
-    punpcklwd       m4, m1
-    punpckhwd       m1, m0, m3
-    punpcklwd       m0, m3
-    paddw           m3, m4, m0
-    psubw           m0, m4
-    paddw           m4, m2, m1
-    psubw           m1, m2
-    punpckhdq       m2, m3, m0
-    punpckldq       m3, m0
-    paddw           m0, m3, m2
-    psubw           m2, m3
-    punpckhdq       m3, m4, m1
-    punpckldq       m4, m1
-    paddw           m1, m4, m3
-    psubw           m3, m4
-    punpckhqdq      m4, m0, m1
-    punpcklqdq      m0, m1
-    pabsw           m0, m0
-    pabsw           m4, m4
-    pmaxsw          m0, m0, m4
-    punpckhqdq      m1, m2, m3
-    punpcklqdq      m2, m3
-    pabsw           m2, m2
-    pabsw           m1, m1
-    pmaxsw          m2, m1
-    pxor            m7, m7
-    mova            m1, m0
-    punpcklwd       m1, m7
-    paddd           m6, m1
-    mova            m1, m0
-    punpckhwd       m1, m7
-    paddd           m6, m1
-    pxor            m7, m7
-    mova            m1, m2
-    punpcklwd       m1, m7
-    paddd           m6, m1
-    mova            m1, m2
-    punpckhwd       m1, m7
-    paddd           m6, m1
-    ; rows 4-7
-    movu            m0, [r0]
-    movu            m4, [r2]
-    psubw           m0, m4
-    movu            m1, [r0 + r1]
-    movu            m5, [r2 + r3]
-    psubw           m1, m5
-    movu            m2, [r0 + r1 * 2]
-    movu            m4, [r2 + r3 * 2]
-    psubw           m2, m4
-    movu            m3, [r0 + r4]
-    movu            m5, [r2 + r5]
-    psubw           m3, m5
-    lea             r0, [r0 + r1 * 4]
-    lea             r2, [r2 + r3 * 4]
+
     paddw           m4, m0, m1
     psubw           m1, m0
     paddw           m0, m2, m3
@@ -14092,6 +14041,89 @@
     punpckhwd       m1, m7
     paddd           m6, m1
 %endmacro
+%macro PROCESS_SATD_32x4_HBD_AVX512 0        ; function to compute satd cost for 32 columns, 4 rows
+    ; rows 0-3
+    movu            m0, [r0]
+    movu            m4, [r2]
+    psubw           m0, m4
+    movu            m1, [r0 + r1]
+    movu            m5, [r2 + r3]
+    psubw           m1, m5
+    movu            m2, [r0 + r1 * 2]
+    movu            m4, [r2 + r3 * 2]
+    psubw           m2, m4
+    movu            m3, [r0 + r4]
+    movu            m5, [r2 + r5]
+    psubw           m3, m5
+    paddw           m4, m0, m1
+    psubw           m1, m0
+    paddw           m0, m2, m3
+    psubw           m3, m2
+    punpckhwd       m2, m4, m1
+    punpcklwd       m4, m1
+    punpckhwd       m1, m0, m3
+    punpcklwd       m0, m3
+    paddw           m3, m4, m0
+    psubw           m0, m4
+    paddw           m4, m2, m1
+    psubw           m1, m2
+    punpckhdq       m2, m3, m0
+    punpckldq       m3, m0
+    paddw           m0, m3, m2
+    psubw           m2, m3
+    punpckhdq       m3, m4, m1
+    punpckldq       m4, m1
+    paddw           m1, m4, m3
+    psubw           m3, m4
+    punpckhqdq      m4, m0, m1
+    punpcklqdq      m0, m1
+    pabsw           m0, m0
+    pabsw           m4, m4
+    pmaxsw          m0, m0, m4
+    punpckhqdq      m1, m2, m3
+    punpcklqdq      m2, m3
+    pabsw           m2, m2
+    pabsw           m1, m1
+    pmaxsw          m2, m1
+    pxor            m7, m7
+    mova            m1, m0
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m0
+    punpckhwd       m1, m7
+    paddd           m6, m1
+    pxor            m7, m7
+    mova            m1, m2
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m2
+    punpckhwd       m1, m7
+    paddd           m6, m1
+%endmacro
+
+%macro SATD_16xN_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal pixel_satd_16x%1, 4,8,8
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+
+%rep %1/8 - 1
+    PROCESS_SATD_16x8_HBD_AVX512
+    lea             r0, [r6 + 4 * r1]
+    lea             r2, [r7 + 4 * r3]
+%endrep
+    PROCESS_SATD_16x8_HBD_AVX512
+    SATD_HBD_AVX512_END
+    RET
+%endmacro
+
+SATD_16xN_HBD_AVX512 8
+SATD_16xN_HBD_AVX512 16
+SATD_16xN_HBD_AVX512 32
+SATD_16xN_HBD_AVX512 64
 
 %macro SATD_32xN_HBD_AVX512 1
 INIT_ZMM avx512
@@ -14103,10 +14135,12 @@
     pxor            m6, m6
     mov             r6, r0
     mov             r7, r2
-
-%rep %1/8
-    PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+    PROCESS_SATD_32x4_HBD_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
 %endrep
+    PROCESS_SATD_32x4_HBD_AVX512
     SATD_HBD_AVX512_END
     RET
 %endmacro
@@ -14127,15 +14161,20 @@
     pxor            m6, m6
     mov             r6, r0
     mov             r7, r2
-
-%rep %1/8
-    PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+    PROCESS_SATD_32x4_HBD_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
 %endrep
+    PROCESS_SATD_32x4_HBD_AVX512
     lea             r0, [r6 + mmsize]
     lea             r2, [r7 + mmsize]
-%rep %1/8
-    PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+    PROCESS_SATD_32x4_HBD_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
 %endrep
+    PROCESS_SATD_32x4_HBD_AVX512
     SATD_HBD_AVX512_END
     RET
 %endmacro

