[x265] [PATCH 06 of 12] asm: interp_4tap_vert_ps_8xN sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Mon May 18 04:48:57 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1431914024 25200
# Node ID 2a91f18790caee8c3b77838f04ae131acc2544b2
# Parent  4018cf6354c4524ec0d0409ade3de01e19f92364
asm: interp_4tap_vert_ps_8xN sse2

Converted the vert_pp_8xN macro so that it also generates the ps primitives. This replaces the C code for ps with minimal impact on pp.

64-bit

./test/TestBench --testbench interp | grep vp | grep " 8x"
chroma_vpp[  8x8]	4.08x 	 2004.98  	 8188.26
chroma_vps[  8x8]	3.30x 	 1877.49  	 6197.96
chroma_vpp[ 8x16]	4.08x 	 3974.99  	 16231.35
chroma_vps[ 8x16]	3.30x 	 3729.98  	 12308.11
chroma_vpp[ 8x32]	4.07x 	 7885.22  	 32072.63
chroma_vps[ 8x32]	3.36x 	 7284.99  	 24442.68
chroma_vpp[ 8x16]	4.09x 	 3964.98  	 16230.44
chroma_vps[ 8x16]	3.35x 	 3677.49  	 12308.64
chroma_vpp[  8x8]	4.08x 	 2005.00  	 8187.65
chroma_vps[  8x8]	3.30x 	 1877.52  	 6199.94
chroma_vpp[ 8x32]	4.07x 	 7886.48  	 32099.09
chroma_vps[ 8x32]	3.28x 	 7417.49  	 24307.48
chroma_vpp[ 8x12]	4.10x 	 2994.99  	 12269.99
chroma_vps[ 8x12]	3.31x 	 2809.98  	 9307.72
chroma_vpp[ 8x64]	4.05x 	 15735.15 	 63743.21
chroma_vps[ 8x64]	3.30x 	 14640.09 	 48369.12
chroma_vpp[  8x8]	4.08x 	 2005.00  	 8187.79
chroma_vps[  8x8]	3.28x 	 1889.99  	 6198.50
chroma_vpp[ 8x16]	4.04x 	 4013.35  	 16231.04
chroma_vps[ 8x16]	3.33x 	 3692.69  	 12307.46
chroma_vpp[ 8x32]	4.06x 	 7894.98  	 32070.79
chroma_vps[ 8x32]	3.28x 	 7417.48  	 24307.55

diff -r 4018cf6354c4 -r 2a91f18790ca source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun May 17 18:41:31 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Sun May 17 18:53:44 2015 -0700
@@ -1524,8 +1524,19 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vps = x265_interp_4tap_vert_ps_8x2_sse2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_sse2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vps = x265_interp_4tap_vert_ps_8x6_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_sse2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vps = x265_interp_4tap_vert_ps_6x16_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vps = x265_interp_4tap_vert_ps_8x12_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = x265_interp_4tap_vert_ps_8x64_sse2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_sse2;
 #endif
 
         ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
diff -r 4018cf6354c4 -r 2a91f18790ca source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Sun May 17 18:41:31 2015 -0700
+++ b/source/common/x86/ipfilter8.asm	Sun May 17 18:53:44 2015 -0700
@@ -1720,12 +1720,11 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)   ; %1 = pp or ps; for ps, dst is int16_t*
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W8_H8_H16_H32_sse2 2
 INIT_XMM sse2
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 11
-
+cglobal interp_4tap_vert_%1_8x%2, 4, 6, 11
     mov         r4d,       r4m
     sub         r0,        r1
     shl         r4d,       5
@@ -1740,7 +1739,13 @@
     mova        m5,        [tab_ChromaCoeff + r4 + 16]
 %endif
 
+%ifidn %1,pp
     mova        m4,        [pw_32]
+%elifidn %1,ps
+    mova        m4,        [pw_2000]
+    add         r3d,       r3d
+%endif
+
     lea         r5,        [r1 * 3]
 
 %assign x 1
@@ -1770,8 +1775,14 @@
     packssdw    m7,        m8
 
     paddw       m0,        m7
+
+%ifidn %1,pp
     paddw       m0,        m4
     psraw       m0,        6
+%elifidn %1,ps
+    psubw       m0,        m4
+    movu        [r2],      m0
+%endif
 
     lea         r0,        [r0 + 4 * r1]
     movq        m10,       [r0]
@@ -1793,12 +1804,18 @@
     packssdw    m7,        m8
 
     paddw       m1,        m7
+
+%ifidn %1,pp
     paddw       m1,        m4
     psraw       m1,        6
 
     packuswb    m0,        m1
     movh        [r2],      m0
     movhps      [r2 + r3], m0
+%elifidn %1,ps
+    psubw       m1,        m4
+    movu        [r2 + r3], m1
+%endif
 
     movq        m1,        [r0 + r1]
     punpcklbw   m10,       m1
@@ -1818,8 +1835,15 @@
     packssdw    m10,       m8
 
     paddw       m2,        m10
+    lea         r2,        [r2 + 2 * r3]
+
+%ifidn %1,pp
     paddw       m2,        m4
     psraw       m2,        6
+%elifidn %1,ps
+    psubw       m2,        m4
+    movu        [r2],      m2
+%endif
 
     movq        m7,        [r0 + 2 * r1]
     punpcklbw   m1,        m7
@@ -1839,13 +1863,19 @@
     packssdw    m1,        m8
 
     paddw       m3,        m1
+
+%ifidn %1,pp
     paddw       m3,        m4
     psraw       m3,        6
 
     packuswb    m2,        m3
-    lea         r2,        [r2 + 2 * r3]
     movh        [r2],      m2
     movhps      [r2 + r3], m2
+%elifidn %1,ps
+    psubw       m3,        m4
+    movu        [r2 + r3], m3
+%endif
+
 %if x < %2/4
     lea         r2,        [r2 + 2 * r3]
 %endif
@@ -1854,12 +1884,19 @@
 %endmacro
 
 %if ARCH_X86_64
-    FILTER_V4_W8_H8_H16_H32_sse2 8,  8
-    FILTER_V4_W8_H8_H16_H32_sse2 8, 16
-    FILTER_V4_W8_H8_H16_H32_sse2 8, 32
-
-    FILTER_V4_W8_H8_H16_H32_sse2 8, 12
-    FILTER_V4_W8_H8_H16_H32_sse2 8, 64
+    FILTER_V4_W8_H8_H16_H32_sse2 pp,  8
+    FILTER_V4_W8_H8_H16_H32_sse2 pp, 16
+    FILTER_V4_W8_H8_H16_H32_sse2 pp, 32
+
+    FILTER_V4_W8_H8_H16_H32_sse2 pp, 12
+    FILTER_V4_W8_H8_H16_H32_sse2 pp, 64
+
+    FILTER_V4_W8_H8_H16_H32_sse2 ps,  8
+    FILTER_V4_W8_H8_H16_H32_sse2 ps, 16
+    FILTER_V4_W8_H8_H16_H32_sse2 ps, 32
+
+    FILTER_V4_W8_H8_H16_H32_sse2 ps, 12
+    FILTER_V4_W8_H8_H16_H32_sse2 ps, 64
 %endif
 
 ;-----------------------------------------------------------------------------
diff -r 4018cf6354c4 -r 2a91f18790ca source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Sun May 17 18:41:31 2015 -0700
+++ b/source/common/x86/ipfilter8.h	Sun May 17 18:53:44 2015 -0700
@@ -959,6 +959,11 @@
 void x265_interp_4tap_vert_ps_8x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_vert_ps_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_vert_ps_8x6_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
 #endif
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS


More information about the x265-devel mailing list