[x265] [PATCH 235 of 307] x86: AVX512 interp_4tap_vert_ps_64xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:53 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1511781308 -19800
#      Mon Nov 27 16:45:08 2017 +0530
# Node ID 1cd123613bbb28fd00da36a3cfe3765f8e07d00e
# Parent  283aa4d77cef296699167c041763d7115e7a88aa
x86: AVX512 interp_4tap_vert_ps_64xN

Size  |  AVX2 speedup (x) | AVX512 speedup (x)
----------------------------------------------
64x16 |      39.17x       |      64.63x
64x32 |      40.14x       |      64.98x
64x48 |      39.97x       |      64.52x
64x64 |      40.32x       |      64.93x

diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Dec 04 17:38:29 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 27 16:45:08 2017 +0530
@@ -5087,6 +5087,11 @@
         p.quant = PFX(quant_avx512);
         p.nquant = PFX(nquant_avx512);
         p.denoiseDct = PFX(denoise_dct_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
     }
 #endif
 }
diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Dec 04 17:38:29 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Nov 27 16:45:08 2017 +0530
@@ -243,10 +243,13 @@
 const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
 
 ALIGN 64
+interp4_vps_store1_avx512:   dq 0, 1, 8, 9, 2, 3, 10, 11
+interp4_vps_store2_avx512:   dq 4, 5, 12, 13, 6, 7, 14, 15
 const interp4_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
 const interp4_hps_store_16xN_avx512,  dq 0, 2, 1, 3, 4, 6, 5, 7
 const interp8_hps_store_avx512,  dq 0, 1, 4, 5, 2, 3, 6, 7
 const interp8_vsp_store_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
+
 SECTION .text
 cextern pb_128
 cextern pw_1
@@ -10864,7 +10867,7 @@
 %endif
 
 ;-------------------------------------------------------------------------------------------------------------
-;avx512 chroma_vpp code start
+;avx512 chroma_vpp and chroma_vps code start
 ;-------------------------------------------------------------------------------------------------------------
 %macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
     lea                   r5,                 [r0 + 4 * r1]
@@ -11157,7 +11160,7 @@
     RET
 %endif
 
-%macro PROCESS_CHROMA_VERT_PP_64x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_64x4_AVX512 1
     movu              m0,              [r0]                        ; m0 = row 0
     movu              m1,              [r0 + r1]                   ; m1 = row 1
     punpcklbw         m2,              m0,                m1
@@ -11179,10 +11182,21 @@
     paddw             m2,              m8
     paddw             m3,              m9
 
+%ifidn %1,pp
     pmulhrsw          m2,              m12
     pmulhrsw          m3,              m12
     packuswb          m2,              m3
     movu              [r2],            m2
+%else
+    psubw             m2, m12
+    psubw             m3, m12
+    movu              m8, m13
+    movu              m9, m14
+    vpermi2q          m8, m2, m3
+    vpermi2q          m9, m2, m3
+    movu              [r2], m8
+    movu              [r2 + mmsize], m9
+%endif
 
     lea               r0,              [r0 + r1 * 4]
     movu              m0,              [r0]                        ; m0 = row 4
@@ -11194,10 +11208,22 @@
     pmaddubsw         m3,              m10
     paddw             m4,              m8
     paddw             m5,              m9
+
+%ifidn %1,pp
     pmulhrsw          m4,              m12
     pmulhrsw          m5,              m12
     packuswb          m4,              m5
     movu              [r2 + r3],       m4
+%else
+    psubw             m4, m12
+    psubw             m5, m12
+    movu              m8, m13
+    movu              m9, m14
+    vpermi2q          m8, m4, m5
+    vpermi2q          m9, m4, m5
+    movu              [r2 + r3], m8
+    movu              [r2 + r3 + mmsize], m9
+%endif
 
     movu              m1,              [r0 + r1]                   ; m1 = row 5
     punpcklbw         m4,              m0,                m1
@@ -11207,11 +11233,21 @@
     paddw             m6,              m4
     paddw             m7,              m5
 
+%ifidn %1,pp
     pmulhrsw          m6,              m12
     pmulhrsw          m7,              m12
     packuswb          m6,              m7
     movu              [r2 + r3 * 2],   m6
-
+%else
+    psubw             m6, m12
+    psubw             m7, m12
+    movu              m8, m13
+    movu              m9, m14
+    vpermi2q          m8, m6, m7
+    vpermi2q          m9, m6, m7
+    movu              [r2 + 2 * r3], m8
+    movu              [r2 + 2 * r3 + mmsize], m9
+%endif
     movu              m0,              [r0 + r1 * 2]               ; m0 = row 6
     punpcklbw         m6,              m1,                m0
     punpckhbw         m7,              m1,                m0
@@ -11219,16 +11255,27 @@
     pmaddubsw         m7,              m11
     paddw             m2,              m6
     paddw             m3,              m7
+
+%ifidn %1,pp
     pmulhrsw          m2,              m12
     pmulhrsw          m3,              m12
     packuswb          m2,              m3
     movu              [r2 + r5],       m2
-%endmacro
-
-%macro FILTER_VER_PP_CHROMA_AVX512_64xN 1
-%if ARCH_X86_64 == 1
+%else
+    psubw             m2, m12
+    psubw             m3, m12
+    movu              m8, m13
+    movu              m9, m14
+    vpermi2q          m8, m2, m3
+    vpermi2q          m9, m2, m3
+    movu              [r2 + r5], m8
+    movu              [r2 + r5 + mmsize], m9
+%endif
+%endmacro
+
+%macro FILTER_VER_CHROMA_AVX512_64xN 2
 INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_64x%1, 4, 6, 13
+cglobal interp_4tap_vert_%1_64x%2, 4, 6, 15
     mov               r4d,             r4m
     shl               r4d,             7
 
@@ -11241,26 +11288,39 @@
     mova              m11,             [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
 %endif
 
+%ifidn %1,pp
+    vbroadcasti32x8            m12, [pw_512]
+%else
+    add                        r3d, r3d
+    vbroadcasti32x8            m12, [pw_2000]
+    mova                       m13, [interp4_vps_store1_avx512]
+    mova                       m14, [interp4_vps_store2_avx512]
+%endif
     lea               r4,              [r1 * 3]
     sub               r0,              r1
-    vbroadcasti32x8   m12,             [pw_512]
     lea               r5,              [r3 * 3]
 
-%rep %1/4 - 1
-    PROCESS_CHROMA_VERT_PP_64x4_AVX512
+%rep %2/4 - 1
+    PROCESS_CHROMA_VERT_64x4_AVX512 %1
     lea               r2, [r2 + r3 * 4]
 %endrep
-    PROCESS_CHROMA_VERT_PP_64x4_AVX512
-    RET
-%endif
-%endmacro
-
-FILTER_VER_PP_CHROMA_AVX512_64xN 64
-FILTER_VER_PP_CHROMA_AVX512_64xN 48
-FILTER_VER_PP_CHROMA_AVX512_64xN 32
-FILTER_VER_PP_CHROMA_AVX512_64xN 16
-;-------------------------------------------------------------------------------------------------------------
-;avx512 chroma_vpp code end
+    PROCESS_CHROMA_VERT_64x4_AVX512 %1
+    RET
+%endmacro
+
+%if ARCH_X86_64 == 1
+FILTER_VER_CHROMA_AVX512_64xN pp, 64
+FILTER_VER_CHROMA_AVX512_64xN pp, 48
+FILTER_VER_CHROMA_AVX512_64xN pp, 32
+FILTER_VER_CHROMA_AVX512_64xN pp, 16
+
+FILTER_VER_CHROMA_AVX512_64xN ps, 64
+FILTER_VER_CHROMA_AVX512_64xN ps, 48
+FILTER_VER_CHROMA_AVX512_64xN ps, 32
+FILTER_VER_CHROMA_AVX512_64xN ps, 16
+%endif
+;-------------------------------------------------------------------------------------------------------------
+;avx512 chroma_vpp and chroma_vps code end
 ;-------------------------------------------------------------------------------------------------------------
 ;-------------------------------------------------------------------------------------------------------------
 ;avx512 chroma_vss code start


More information about the x265-devel mailing list