[x265-commits] [x265] asm: interp_8tap_horiz pp and ps sse2

David T Yuen dtyx265 at gmail.com
Sat May 2 05:52:44 CEST 2015


details:   http://hg.videolan.org/x265/rev/8dd53df88421
branches:  
changeset: 10352:8dd53df88421
user:      David T Yuen <dtyx265 at gmail.com>
date:      Thu Apr 30 05:59:45 2015 -0700
description:
asm: interp_8tap_horiz pp and ps sse2

This replaces the C code and covers the following block sizes:

4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 8x32, 12x16, 16x4, 16x8, 16x12, 16x16, 16x32, 16x64,
24x32, 32x8, 32x16, 32x24, 32x32, 32x64, 48x64, 64x16, 64x32, 64x48, 64x64

64-bit

./test/TestBench --testbench interp | grep luma_h
luma_hpp[  4x4]		1.93x 	 1785.29  	 3440.28
luma_hps[  4x4]		1.85x 	 4487.96  	 8320.21
luma_hpp[  8x8]		2.22x 	 6005.12  	 13357.72
luma_hps[  8x8]		1.93x 	 10942.82 	 21135.69
luma_hpp[16x16]		2.55x 	 23903.86 	 61070.57
luma_hps[16x16]		2.30x 	 32845.00 	 75494.60
luma_hpp[32x32]		2.55x 	 94419.45 	 241073.86
luma_hps[32x32]		2.21x 	 110782.46 	 244683.78
luma_hpp[64x64]		2.53x 	 376337.31 	 951570.12
luma_hps[64x64]		2.30x 	 402073.75 	 924435.56
luma_hpp[  8x4]		2.37x 	 3028.20  	 7175.86
luma_hps[  8x4]		1.92x 	 8122.82  	 15575.28
luma_hpp[  4x8]		1.94x 	 3562.81  	 6902.89
luma_hps[  4x8]		1.85x 	 6087.62  	 11235.28
luma_hpp[ 16x8]		2.57x 	 11910.06 	 30555.35
luma_hps[ 16x8]		2.27x 	 21564.17 	 48875.00
luma_hpp[ 8x16]		2.26x 	 12018.65 	 27189.64
luma_hps[ 8x16]		1.96x 	 16582.61 	 32435.16
luma_hpp[32x16]		2.55x 	 47313.87 	 120654.91
luma_hps[32x16]		2.21x 	 65361.34 	 144196.59
luma_hpp[16x32]		2.55x 	 47465.93 	 121177.02
luma_hps[16x32]		2.28x 	 55633.76 	 126907.55
luma_hpp[64x32]		2.53x 	 187898.22 	 475918.91
luma_hps[64x32]		2.30x 	 221152.92 	 508105.84
luma_hpp[32x64]		2.55x 	 188351.70 	 481050.72
luma_hps[32x64]		2.21x 	 201352.56 	 445565.69
luma_hpp[16x12]		2.54x 	 18025.36 	 45705.77
luma_hps[16x12]		2.27x 	 27205.93 	 61835.00
luma_hpp[12x16]		2.35x 	 18920.75 	 44486.00
luma_hps[12x16]		1.88x 	 25562.62 	 48125.28
luma_hpp[ 16x4]		2.55x 	 5973.02  	 15213.14
luma_hps[ 16x4]		2.27x 	 15877.92 	 36108.43
luma_hpp[ 4x16]		2.07x 	 7217.57  	 14942.64
luma_hps[ 4x16]		1.87x 	 9127.50  	 17075.56
luma_hpp[32x24]		2.54x 	 70851.95 	 179641.31
luma_hps[32x24]		2.21x 	 88052.20 	 194443.84
luma_hpp[24x32]		2.56x 	 70742.67 	 181290.56
luma_hps[24x32]		2.23x 	 83209.55 	 185356.34
luma_hpp[ 32x8]		2.54x 	 23639.81 	 60057.16
luma_hps[ 32x8]		2.20x 	 42754.99 	 94160.41
luma_hpp[ 8x32]		2.44x 	 23742.78 	 57819.51
luma_hps[ 8x32]		1.96x 	 27986.91 	 54777.17
luma_hpp[64x48]		2.53x 	 281572.38 	 712128.56
luma_hps[64x48]		2.29x 	 312399.41 	 715907.50
luma_hpp[48x64]		2.53x 	 281742.69 	 712628.06
luma_hps[48x64]		2.19x 	 301655.44 	 661646.25
luma_hpp[64x16]		2.52x 	 94103.24 	 237202.28
luma_hps[64x16]		2.30x 	 130542.85 	 299616.22
luma_hpp[16x64]		2.57x 	 94735.02 	 243127.64
luma_hps[16x64]		2.29x 	 100840.45 	 230957.56
Subject: [x265] asm: interp_8tap_hv_pp_8x8 sse3

details:   http://hg.videolan.org/x265/rev/57f8246c759d
branches:  
changeset: 10353:57f8246c759d
user:      David T Yuen <dtyx265 at gmail.com>
date:      Wed Apr 29 19:40:08 2015 -0700
description:
asm: interp_8tap_hv_pp_8x8 sse3

This replaces the C code.

64-bit

./test/TestBench --testbench interp | grep hv
luma_hv [  8x8]		2.53x 	 14225.03 	 35970.65

32-bit

./test/TestBench --testbench interp | grep hv
luma_hv [  8x8]		2.50x 	 14367.40 	 35917.48

diffstat:

 source/common/x86/asm-primitives.cpp |    6 +
 source/common/x86/ipfilter8.asm      |  304 +++++++++++++++++++++++++++++++++++
 source/common/x86/ipfilter8.h        |   51 +++++
 3 files changed, 361 insertions(+), 0 deletions(-)

diffs (truncated from 405 to 300 lines):

diff -r 94e9c3464c49 -r 57f8246c759d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri May 01 14:56:01 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 29 19:40:08 2015 -0700
@@ -1343,6 +1343,12 @@ void setupAssemblyPrimitives(EncoderPrim
         CHROMA_422_VSP_FILTERS(_sse2);
         CHROMA_444_VSP_FILTERS(_sse2);
 
+        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
+        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
+        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
+        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+        p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
+
         //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
 
diff -r 94e9c3464c49 -r 57f8246c759d source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri May 01 14:56:01 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 29 19:40:08 2015 -0700
@@ -151,6 +151,11 @@ const tab_LumaCoeff,   db   0, 0,  0,  6
                        db  -1, 4, -11, 40,  40, -11, 4, -1
                        db   0, 1, -5,  17,  58, -10, 4, -1
 
+const tabw_LumaCoeff,  dw   0, 0,  0,  64,  0,   0,  0,  0      ; 8-tap luma filters stored as words, one 16-byte row per coeffIdx; row 0 is a single tap of 64
+                       dw  -1, 4, -10, 58,  17, -5,  1,  0      ; row 1; the taps of every row sum to 64
+                       dw  -1, 4, -11, 40,  40, -11, 4, -1      ; row 2 (symmetric)
+                       dw   0, 1, -5,  17,  58, -10, 4, -1      ; row 3 (row 1 reversed)
+
 const tab_LumaCoeffV,   times 4 dw 0, 0
                         times 4 dw 0, 64
                         times 4 dw 0, 0
@@ -807,6 +812,233 @@ cglobal interp_4tap_horiz_pp_%1x%2, 4, 6
     IPFILTER_CHROMA_W_sse3 48, 64
     IPFILTER_CHROMA_W_sse3 64, 16
 
+%macro FILTER_H8_W8_sse2 0                  ; 8-tap horizontal filter of 8 adjacent outputs at column x. In: r0 = row pointer, m3 = 8 word taps, m6 = 0. Out: m1 = 8 word sums. Clobbers m0, m4, m5, m7
+    movh        m1, [r0 + x - 3]            ; 8 source bytes feeding output x (taps span x-3 .. x+4)
+    movh        m4, [r0 + x - 2]            ; window for output x+1
+    punpcklbw   m1, m6                      ; widen bytes to words by interleaving with m6 (expected to be zero)
+    punpcklbw   m4, m6
+    movh        m5, [r0 + x - 1]            ; window for output x+2
+    movh        m0, [r0 + x]                ; window for output x+3
+    punpcklbw   m5, m6
+    punpcklbw   m0, m6
+    pmaddwd     m1, m3                      ; multiply by the taps; each dword = sum of one adjacent tap pair
+    pmaddwd     m4, m3
+    pmaddwd     m5, m3
+    pmaddwd     m0, m3
+    packssdw    m1, m4                      ; narrow (signed saturating) to words: low half = output x, high half = output x+1
+    packssdw    m5, m0                      ; low half = output x+2, high half = output x+3
+    pshuflw     m4, m1, q2301               ; copy with adjacent words swapped (x86inc q#### shuffle constant) ...
+    pshufhw     m4, m4, q2301
+    pshuflw     m0, m5, q2301
+    pshufhw     m0, m0, q2301
+    paddw       m1, m4                      ; ... so the add leaves each 4-tap half sum duplicated in a word pair
+    paddw       m5, m0
+    psrldq      m1, 2                       ; shift down one word so two distinct half sums share a dword
+    psrldq      m5, 2
+    pshufd      m1, m1, q3120               ; bring dwords 0 and 2 (the useful ones) into the low qword
+    pshufd      m5, m5, q3120
+    punpcklqdq  m1, m5                      ; m1 = two 4-tap half sums for each of outputs x .. x+3
+    movh        m7, [r0 + x + 1]            ; same sequence for outputs x+4 .. x+7, collected in m7
+    movh        m4, [r0 + x + 2]
+    punpcklbw   m7, m6
+    punpcklbw   m4, m6
+    movh        m5, [r0 + x + 3]
+    movh        m0, [r0 + x + 4]            ; last window reads source bytes up to x+11
+    punpcklbw   m5, m6
+    punpcklbw   m0, m6
+    pmaddwd     m7, m3
+    pmaddwd     m4, m3
+    pmaddwd     m5, m3
+    pmaddwd     m0, m3
+    packssdw    m7, m4
+    packssdw    m5, m0
+    pshuflw     m4, m7, q2301
+    pshufhw     m4, m4, q2301
+    pshuflw     m0, m5, q2301
+    pshufhw     m0, m0, q2301
+    paddw       m7, m4
+    paddw       m5, m0
+    psrldq      m7, 2
+    psrldq      m5, 2
+    pshufd      m7, m7, q3120
+    pshufd      m5, m5, q3120
+    punpcklqdq  m7, m5                      ; m7 = two 4-tap half sums for each of outputs x+4 .. x+7
+    pshuflw     m4, m1, q2301               ; final reduction: the same swap / add / shift / gather once more
+    pshufhw     m4, m4, q2301
+    pshuflw     m0, m7, q2301
+    pshufhw     m0, m0, q2301
+    paddw       m1, m4                      ; complete 8-tap sum of each output, duplicated in a word pair
+    paddw       m7, m0
+    psrldq      m1, 2
+    psrldq      m7, 2
+    pshufd      m1, m1, q3120               ; low qword = sums for outputs x .. x+3
+    pshufd      m7, m7, q3120               ; low qword = sums for outputs x+4 .. x+7
+    punpcklqdq  m1, m7                      ; m1 = 8 word sums for outputs x .. x+7; rounding/offset is left to the caller
+%endmacro
+
+%macro FILTER_H8_W4_sse2 0                  ; 8-tap horizontal filter of 4 adjacent outputs at column x. In: r0 = row pointer, m3 = 8 word taps, m6 = 0. Out: low 4 words of m1. Clobbers m0, m4, m5
+    movh        m1, [r0 + x - 3]            ; 8 source bytes feeding output x (taps span x-3 .. x+4)
+    movh        m0, [r0 + x - 2]            ; window for output x+1
+    punpcklbw   m1, m6                      ; widen bytes to words by interleaving with m6 (expected to be zero)
+    punpcklbw   m0, m6
+    movh        m4, [r0 + x - 1]            ; window for output x+2
+    movh        m5, [r0 + x]                ; window for output x+3 (reads source bytes up to x+7)
+    punpcklbw   m4, m6
+    punpcklbw   m5, m6
+    pmaddwd     m1, m3                      ; multiply by the taps; each dword = sum of one adjacent tap pair
+    pmaddwd     m0, m3
+    pmaddwd     m4, m3
+    pmaddwd     m5, m3
+    packssdw    m1, m0                      ; narrow (signed saturating) to words: low half = output x, high half = output x+1
+    packssdw    m4, m5                      ; low half = output x+2, high half = output x+3
+    pshuflw     m0, m1, q2301               ; copy with adjacent words swapped (x86inc q#### shuffle constant) ...
+    pshufhw     m0, m0, q2301
+    pshuflw     m5, m4, q2301
+    pshufhw     m5, m5, q2301
+    paddw       m1, m0                      ; ... so the add leaves each 4-tap half sum duplicated in a word pair
+    paddw       m4, m5
+    psrldq      m1, 2                       ; shift down one word so two distinct half sums share a dword
+    psrldq      m4, 2
+    pshufd      m1, m1, q3120               ; bring dwords 0 and 2 (the useful ones) into the low qword
+    pshufd      m4, m4, q3120
+    punpcklqdq  m1, m4                      ; m1 = two 4-tap half sums for each of outputs x .. x+3
+    pshuflw     m0, m1, q2301               ; final reduction: the same swap / add / shift / gather once more
+    pshufhw     m0, m0, q2301
+    paddw       m1, m0                      ; complete 8-tap sum of each output, duplicated in a word pair
+    psrldq      m1, 2
+    pshufd      m1, m1, q3120               ; low qword = 4 word sums for outputs x .. x+3; the high qword is not a usable result
+%endmacro
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA_sse2 3                 ; %1 = block width, %2 = block height, %3 = pp (byte output) or ps (16-bit output; only ps reads isRowExt)
+INIT_XMM sse2
+cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8   ; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride
+    mov       r4d, r4m                      ; r4 = coeffIdx
+    add       r4d, r4d                      ; doubled so [base + r4 * 8] steps 16 bytes, i.e. one row of 8 word taps
+    pxor      m6, m6                        ; zero register the FILTER_H8_* macros use to widen bytes to words
+
+%ifidn %3, ps
+    add       r3, r3                        ; dstStride in bytes for 2-byte outputs; pointer-width add so the intptr_t stride is not truncated to 32 bits on x86-64
+    cmp       r5m, byte 0                   ; isRowExt == 0?  Consumed by the je below; only lea/mov/SSE moves sit in between, which leave the flags alone
+%endif
+
+%ifdef PIC
+    lea       r5, [tabw_LumaCoeff]
+    movu      m3, [r5 + r4 * 8]             ; m3 = the 8 word taps selected by coeffIdx
+%else
+    movu      m3, [tabw_LumaCoeff + r4 * 8] ; m3 = the 8 word taps selected by coeffIdx
+%endif
+
+    mov       r4d, %2                       ; r4 = number of rows to produce
+
+%ifidn %3, pp
+    mova      m2, [pw_32]                   ; added before the >> 6 in the pp store (rounding term, going by the constant's name)
+%else
+    mova      m2, [pw_2000]                 ; subtracted from every ps result (constant is defined outside this hunk)
+    je        .loopH                        ; isRowExt == 0: filter only the %2 block rows
+    lea       r5, [r1 + 2 * r1]             ; r5 = 3 * srcStride
+    sub       r0, r5                        ; isRowExt != 0: start 3 rows above the block ...
+    add       r4d, 7                        ; ... and produce 7 extra rows
+%endif
+
+.loopH:
+%assign x 0
+%rep %1 / 8
+    FILTER_H8_W8_sse2                       ; m1 = 8 word sums for columns x .. x+7 of this row
+  %ifidn %3, pp
+    paddw     m1, m2
+    psraw     m1, 6                         ; scale back down (each row of taps sums to 64)
+    packuswb  m1, m1                        ; clamp to 0..255 and narrow to bytes
+    movh      [r2 + x], m1                  ; store 8 pixels
+  %else
+    psubw     m1, m2
+    movu      [r2 + 2 * x], m1              ; store 8 words
+  %endif
+%assign x x+8
+%endrep
+
+%rep (%1 % 8) / 4
+    FILTER_H8_W4_sse2                       ; leftover 4-column group (widths 4 and 12); runs at most once, so x is not advanced afterwards
+  %ifidn %3, pp
+    paddw     m1, m2
+    psraw     m1, 6
+    packuswb  m1, m1
+    movd      [r2 + x], m1                  ; store 4 pixels
+  %else
+    psubw     m1, m2
+    movh      [r2 + 2 * x], m1              ; store 4 words
+  %endif
+%endrep
+
+    add       r0, r1                        ; next source row
+    add       r2, r3                        ; next destination row (r3 was doubled above for ps)
+
+    dec       r4d
+    jnz       .loopH
+    RET
+
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+    IPFILTER_LUMA_sse2 4, 4, pp             ; arguments are width, height, variant; one function is emitted per line
+    IPFILTER_LUMA_sse2 4, 8, pp
+    IPFILTER_LUMA_sse2 8, 4, pp
+    IPFILTER_LUMA_sse2 8, 8, pp
+    IPFILTER_LUMA_sse2 16, 16, pp
+    IPFILTER_LUMA_sse2 16, 8, pp
+    IPFILTER_LUMA_sse2 8, 16, pp
+    IPFILTER_LUMA_sse2 16, 12, pp
+    IPFILTER_LUMA_sse2 12, 16, pp           ; width 12: one 8-column pass plus one 4-column pass per row
+    IPFILTER_LUMA_sse2 16, 4, pp
+    IPFILTER_LUMA_sse2 4, 16, pp
+    IPFILTER_LUMA_sse2 32, 32, pp
+    IPFILTER_LUMA_sse2 32, 16, pp
+    IPFILTER_LUMA_sse2 16, 32, pp
+    IPFILTER_LUMA_sse2 32, 24, pp
+    IPFILTER_LUMA_sse2 24, 32, pp
+    IPFILTER_LUMA_sse2 32, 8, pp
+    IPFILTER_LUMA_sse2 8, 32, pp
+    IPFILTER_LUMA_sse2 64, 64, pp
+    IPFILTER_LUMA_sse2 64, 32, pp
+    IPFILTER_LUMA_sse2 32, 64, pp
+    IPFILTER_LUMA_sse2 64, 48, pp
+    IPFILTER_LUMA_sse2 48, 64, pp
+    IPFILTER_LUMA_sse2 64, 16, pp
+    IPFILTER_LUMA_sse2 16, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+    IPFILTER_LUMA_sse2 4, 4, ps             ; arguments are width, height, variant; one function is emitted per line
+    IPFILTER_LUMA_sse2 8, 8, ps
+    IPFILTER_LUMA_sse2 8, 4, ps
+    IPFILTER_LUMA_sse2 4, 8, ps
+    IPFILTER_LUMA_sse2 16, 16, ps
+    IPFILTER_LUMA_sse2 16, 8, ps
+    IPFILTER_LUMA_sse2 8, 16, ps
+    IPFILTER_LUMA_sse2 16, 12, ps
+    IPFILTER_LUMA_sse2 12, 16, ps           ; width 12: one 8-column pass plus one 4-column pass per row
+    IPFILTER_LUMA_sse2 16, 4, ps
+    IPFILTER_LUMA_sse2 4, 16, ps
+    IPFILTER_LUMA_sse2 32, 32, ps
+    IPFILTER_LUMA_sse2 32, 16, ps
+    IPFILTER_LUMA_sse2 16, 32, ps
+    IPFILTER_LUMA_sse2 32, 24, ps
+    IPFILTER_LUMA_sse2 24, 32, ps
+    IPFILTER_LUMA_sse2 32, 8, ps
+    IPFILTER_LUMA_sse2 8, 32, ps
+    IPFILTER_LUMA_sse2 64, 64, ps
+    IPFILTER_LUMA_sse2 64, 32, ps
+    IPFILTER_LUMA_sse2 32, 64, ps
+    IPFILTER_LUMA_sse2 64, 48, ps
+    IPFILTER_LUMA_sse2 48, 64, ps
+    IPFILTER_LUMA_sse2 64, 16, ps
+    IPFILTER_LUMA_sse2 16, 64, ps
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -3232,6 +3464,78 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 
     RET
 
 ;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM sse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
+    mov         r4d,        r4m
+    mov         r5d,        r5m
+    add         r4d,        r4d
+    pxor        m6,         m6
+
+%ifdef PIC
+    lea         r6,         [tabw_LumaCoeff]
+    mova        m3,         [r6 + r4 * 8]
+%else
+    mova        m3,         [tabw_LumaCoeff + r4 * 8]
+%endif
+
+    ; move to row -3
+    lea         r6,         [r1 + r1 * 2]
+    sub         r0,         r6
+
+    mov         r4,         rsp
+
+%assign x 0     ;needed for FILTER_H8_W8_sse2 macro
+%assign y 1
+%rep 15
+    FILTER_H8_W8_sse2
+    psubw       m1,         [pw_2000]
+    mova        [r4],       m1
+
+%if y < 15
+    add         r0,         r1


More information about the x265-commits mailing list