[x265] [PATCH 091 of 307] x86: AVX512 cleanup interp_4tap_horiz_pp_32xN

mythreyi at multicorewareinc.com
Sat Apr 7 04:31:29 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504242228 -19800
#      Fri Sep 01 10:33:48 2017 +0530
# Node ID dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd
# Parent  d9200885420957bccd4edea62bf87bbe8831bc62
x86: AVX512 cleanup interp_4tap_horiz_pp_32xN

diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun Aug 13 15:12:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Sep 01 10:33:48 2017 +0530
@@ -4011,22 +4011,29 @@
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 
+        //i444 chroma_hpp
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
-
         p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
-
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
 
+        //i422 chroma_hpp
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
+
+        //i420 chroma_hpp
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
+
         p.weight_pp = PFX(weight_pp_avx512);
 
         //i444 chroma_hps
diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Sun Aug 13 15:12:25 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Fri Sep 01 10:33:48 2017 +0530
@@ -150,8 +150,6 @@
 const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
 
-const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15
-
 ALIGN 64
 const interp8_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
 
@@ -9881,31 +9879,30 @@
     ; m9 - store shuffle order table
 
     movu              ym5,           [r0]
-    vinserti32x8       m5,           [r0 + 4], 1
+    vinserti32x8       m5,           [r0 + r1], 1
+    movu              ym7,           [r0 + 4]
+    vinserti32x8       m7,           [r0 + r1 + 4], 1
 
     pshufb             m6,           m5,       m2
-    pshufb             m5,           m5,       m1
+    pshufb             m5,           m1
+    pshufb             m8,           m7,       m2
+    pshufb             m7,           m1
+
     pmaddubsw          m5,           m0
+    pmaddubsw          m7,           m0
+    pmaddwd            m5,           m3
+    pmaddwd            m7,           m3
+
     pmaddubsw          m6,           m0
-    pmaddwd            m5,           m3
+    pmaddubsw          m8,           m0
     pmaddwd            m6,           m3
-
-    movu              ym7,           [r0 + r1]
-    vinserti32x8       m7,           [r0 + r1 + 4], 1
-
-    pshufb             m8,           m7,       m2
-    pshufb             m7,           m7,       m1
-    pmaddubsw          m7,           m0
-    pmaddubsw          m8,           m0
-    pmaddwd            m7,           m3
     pmaddwd            m8,           m3
 
-    packssdw           m5,           m6
-    packssdw           m7,           m8
+    packssdw           m5,           m7
+    packssdw           m6,           m8
     pmulhrsw           m5,           m4
-    pmulhrsw           m7,           m4
-    packuswb           m5,           m7
-    vpermd             m5,           m9,           m5
+    pmulhrsw           m6,           m4
+    packuswb           m5,           m6
     movu             [r2],          ym5
     vextracti32x8    [r2 + r3],      m5,            1
 %endmacro
@@ -9947,7 +9944,7 @@
 
 %macro IPFILTER_CHROMA_PP_32xN_AVX512 1
 INIT_ZMM avx512
-cglobal interp_4tap_horiz_pp_32x%1, 4,6,10
+cglobal interp_4tap_horiz_pp_32x%1, 4,6,9
     mov               r4d,               r4m
 
 %ifdef PIC
@@ -9959,7 +9956,6 @@
 
     vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
     vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
-    movu              m9,           [interp4_horiz_shuf_store1_avx512]
     vbroadcasti32x8   m3,           [pw_1]
     vbroadcasti32x8   m4,           [pw_512]
     dec               r0


More information about the x265-devel mailing list