[x265] [PATCH] asm: interp_8tap_horiz_pX sse2 10-bit

dtyx265 at gmail.com dtyx265 at gmail.com
Wed May 27 22:21:59 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1432758084 25200
# Node ID 5e81b9f2acf59e970adccf2c0c2e23bc76406ea1
# Parent  18939c0e321f08207fa0a383939bc44485773013
asm: interp_8tap_horiz_pX sse2 10-bit

This replaces the C code for all interp_8tap_horiz pp and ps primitives in 10-bit builds.

64-bit

./test/TestBench --testbench interp | grep luma_hp;bp
luma_hpp[  4x4]		1.89x 	 1810.01  	 3425.68
luma_hps[  4x4]		1.69x 	 5067.49  	 8543.05
luma_hpp[  8x8]		1.95x 	 6772.81  	 13190.10
luma_hps[  8x8]		1.81x 	 12277.58 	 22193.04
luma_hpp[16x16]		2.34x 	 26690.99 	 62385.59
luma_hps[16x16]		2.03x 	 36679.31 	 74382.12
luma_hpp[32x32]		2.31x 	 105869.50 	 244192.05
luma_hps[32x32]		1.98x 	 123096.49 	 243866.27
luma_hpp[64x64]		2.30x 	 422438.12 	 972630.69
luma_hps[64x64]		1.96x 	 446698.34 	 875284.19
luma_hpp[  8x4]		1.95x 	 3393.06  	 6601.71
luma_hps[  8x4]		1.80x 	 9097.55  	 16352.79
luma_hpp[  4x8]		1.83x 	 3715.07  	 6794.98
luma_hps[  4x8]		1.80x 	 6427.61  	 11542.51
luma_hpp[ 16x8]		2.33x 	 13336.42 	 31085.32
luma_hps[ 16x8]		2.03x 	 24128.17 	 48955.20
luma_hpp[ 8x16]		1.95x 	 13555.00 	 26435.95
luma_hps[ 8x16]		1.83x 	 18637.67 	 34194.82
luma_hpp[32x16]		2.30x 	 53091.36 	 122332.84
luma_hps[32x16]		1.99x 	 72525.66 	 144056.80
luma_hpp[16x32]		2.35x 	 52985.05 	 124563.56
luma_hps[16x32]		2.02x 	 62106.79 	 125750.16
luma_hpp[64x32]		2.29x 	 211133.45 	 484107.31
luma_hps[64x32]		1.96x 	 245171.02 	 480323.22
luma_hpp[32x64]		2.32x 	 211742.39 	 491633.34
luma_hps[32x64]		1.98x 	 224430.12 	 443457.41
luma_hpp[16x12]		2.32x 	 20139.19 	 46754.22
luma_hps[16x12]		2.03x 	 30357.20 	 61526.66
luma_hpp[12x16]		1.93x 	 20543.72 	 39604.34
luma_hps[12x16]		1.80x 	 28044.56 	 50457.48
luma_hpp[ 16x4]		2.24x 	 6683.08  	 14967.70
luma_hps[ 16x4]		2.01x 	 17668.30 	 35595.00
luma_hpp[ 4x16]		1.89x 	 7095.19  	 13389.99
luma_hps[ 4x16]		1.73x 	 10167.50 	 17543.30
luma_hpp[32x24]		2.30x 	 79480.97 	 183180.16
luma_hps[32x24]		1.98x 	 98020.54 	 193752.25
luma_hpp[24x32]		2.33x 	 79450.27 	 184976.78
luma_hps[24x32]		2.00x 	 92464.59 	 185285.14
luma_hpp[ 32x8]		2.31x 	 26498.50 	 61131.69
luma_hps[ 32x8]		1.98x 	 47509.77 	 93843.30
luma_hpp[ 8x32]		1.93x 	 27285.71 	 52692.59
luma_hps[ 8x32]		1.79x 	 31998.99 	 57235.87
luma_hpp[64x48]		2.30x 	 316549.00 	 729001.88
luma_hps[64x48]		1.96x 	 346096.66 	 679268.88
luma_hpp[48x64]		2.31x 	 317071.12 	 732797.31
luma_hps[48x64]		1.97x 	 336062.44 	 660764.25
luma_hpp[64x16]		2.28x 	 105878.89 	 241366.52
luma_hps[64x16]		1.96x 	 144960.28 	 283637.09
luma_hpp[16x64]		2.35x 	 106207.98 	 250095.09
luma_hps[16x64]		2.03x 	 113082.70 	 229174.33

diff -r 18939c0e321f -r 5e81b9f2acf5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue May 26 13:17:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed May 27 13:21:24 2015 -0700
@@ -857,6 +857,11 @@
         CHROMA_422_VERT_FILTERS(_sse2);
         CHROMA_444_VERT_FILTERS(sse2);
 
+        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
+        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
+        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
+        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
         p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
         PIXEL_AVG(sse2);
diff -r 18939c0e321f -r 5e81b9f2acf5 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue May 26 13:17:55 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed May 27 13:21:24 2015 -0700
@@ -127,6 +127,247 @@
 cextern pd_n32768
 cextern pw_2000
 
+%macro FILTER_LUMA_HOR_4_sse2 1 ; %1 = byte offset from r0; in: m0 = 8 coefficient words, m1 = dword constant; out: m4 = 4 dword sums + m1; scratch: m2, m3, m5
+    movu        m4,     [r0 + %1]       ; m4 = src[0-7]   (8 words feeding output 0)
+    movu        m5,     [r0 + %1 + 2]   ; m5 = src[1-8]   (8 words feeding output 1)
+    pmaddwd     m4,     m0              ; 4 dwords, each the sum of two adjacent word products
+    pmaddwd     m5,     m0
+    pshufd      m2,     m4,     q2301   ; m2 = m4 with the dwords of each pair swapped
+    paddd       m4,     m2              ; dwords 0/1 = sum of first two, dwords 2/3 = sum of last two
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m4,     m4,     q3120   ; move dwords 0 and 2 into the low qword
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5              ; m4 = {out0 half, out0 half, out1 half, out1 half}
+
+    movu        m5,     [r0 + %1 + 4]   ; m5 = src[2-9]   (8 words feeding output 2)
+    movu        m3,     [r0 + %1 + 6]   ; m3 = src[3-10]  (8 words feeding output 3)
+    pmaddwd     m5,     m0
+    pmaddwd     m3,     m0
+    pshufd      m2,     m5,     q2301   ; same pairwise reduction as above for outputs 2 and 3
+    paddd       m5,     m2
+    pshufd      m2,     m3,     q2301
+    paddd       m3,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m3,     m3,     q3120
+    punpcklqdq  m5,     m3              ; m5 = {out2 half, out2 half, out3 half, out3 half}
+
+    pshufd      m2,     m4,     q2301   ; second reduction round: add the two halves of every output
+    paddd       m4,     m2              ; m4 = {out0, out0, out1, out1}
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2              ; m5 = {out2, out2, out3, out3}
+    pshufd      m4,     m4,     q3120   ; m4 low qword = {out0, out1}
+    pshufd      m5,     m5,     q3120   ; m5 low qword = {out2, out3}
+    punpcklqdq  m4,     m5              ; m4 = {out0, out1, out2, out3}
+    paddd       m4,     m1              ; add the dword constant the caller keeps in m1
+%endmacro
+
+%macro FILTER_LUMA_HOR_8_sse2 1 ; %1 = byte offset from r0; in: m0 = 8 coefficient words, m1 = dword constant; out: m4 = outputs 0-3, m5 = outputs 4-7 (dwords, + m1); scratch: m2, m3, m6
+    movu        m4,     [r0 + %1]       ; m4 = src[0-7]   (8 words feeding output 0)
+    movu        m5,     [r0 + %1 + 2]   ; m5 = src[1-8]   (8 words feeding output 1)
+    pmaddwd     m4,     m0              ; 4 dwords, each the sum of two adjacent word products
+    pmaddwd     m5,     m0
+    pshufd      m2,     m4,     q2301   ; m2 = m4 with the dwords of each pair swapped
+    paddd       m4,     m2              ; dwords 0/1 = sum of first two, dwords 2/3 = sum of last two
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m4,     m4,     q3120   ; move dwords 0 and 2 into the low qword
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5              ; m4 = {out0 half, out0 half, out1 half, out1 half}
+
+    movu        m5,     [r0 + %1 + 4]   ; m5 = src[2-9]   (8 words feeding output 2)
+    movu        m3,     [r0 + %1 + 6]   ; m3 = src[3-10]  (8 words feeding output 3)
+    pmaddwd     m5,     m0
+    pmaddwd     m3,     m0
+    pshufd      m2,     m5,     q2301   ; same pairwise reduction for outputs 2 and 3
+    paddd       m5,     m2
+    pshufd      m2,     m3,     q2301
+    paddd       m3,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m3,     m3,     q3120
+    punpcklqdq  m5,     m3              ; m5 = {out2 half, out2 half, out3 half, out3 half}
+
+    pshufd      m2,     m4,     q2301   ; second reduction round: add the two halves of every output
+    paddd       m4,     m2              ; m4 = {out0, out0, out1, out1}
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2              ; m5 = {out2, out2, out3, out3}
+    pshufd      m4,     m4,     q3120
+    pshufd      m5,     m5,     q3120
+    punpcklqdq  m4,     m5              ; m4 = {out0, out1, out2, out3}
+    paddd       m4,     m1              ; add the dword constant the caller keeps in m1
+
+    movu        m5,     [r0 + %1 + 8]   ; m5 = src[4-11]  (8 words feeding output 4)
+    movu        m6,     [r0 + %1 + 10]  ; m6 = src[5-12]  (8 words feeding output 5)
+    pmaddwd     m5,     m0
+    pmaddwd     m6,     m0
+    pshufd      m2,     m5,     q2301   ; first reduction round for outputs 4 and 5
+    paddd       m5,     m2
+    pshufd      m2,     m6,     q2301
+    paddd       m6,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m6,     m6,     q3120
+    punpcklqdq  m5,     m6              ; m5 = {out4 half, out4 half, out5 half, out5 half}
+
+    movu        m6,     [r0 + %1 + 12]  ; m6 = src[6-13]  (8 words feeding output 6)
+    movu        m3,     [r0 + %1 + 14]  ; m3 = src[7-14]  (8 words feeding output 7)
+    pmaddwd     m6,     m0
+    pmaddwd     m3,     m0
+    pshufd      m2,     m6,     q2301   ; first reduction round for outputs 6 and 7
+    paddd       m6,     m2
+    pshufd      m2,     m3,     q2301
+    paddd       m3,     m2
+    pshufd      m6,     m6,     q3120
+    pshufd      m3,     m3,     q3120
+    punpcklqdq  m6,     m3              ; m6 = {out6 half, out6 half, out7 half, out7 half}
+
+    pshufd      m2,     m5,     q2301   ; second reduction round for outputs 4-7
+    paddd       m5,     m2              ; m5 = {out4, out4, out5, out5}
+    pshufd      m2,     m6,     q2301
+    paddd       m6,     m2              ; m6 = {out6, out6, out7, out7}
+    pshufd      m5,     m5,     q3120
+    pshufd      m6,     m6,     q3120
+    punpcklqdq  m5,     m6              ; m5 = {out4, out5, out6, out7}
+    paddd       m5,     m1              ; add the dword constant the caller keeps in m1
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx[, int isRowExt]) ; %1 = width, %2 = height, %3 = pp | ps; isRowExt is read by ps only
+;------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_sse2 3
+INIT_XMM sse2
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+    mov         r4d,    r4m             ; r4d = coeffIdx
+    sub         r0,     6               ; move src back 6 bytes (three 2-byte samples)
+    shl         r4d,    4               ; coeffIdx * 16 = byte offset of a 16-byte coefficient row
+    add         r1,     r1              ; double srcStride (2 bytes per sample); full width so an intptr_t stride is not truncated
+    add         r3,     r3              ; double dstStride, full width for the same reason
+
+%ifdef PIC
+    lea         r6,     [tab_LumaCoeff]
+    mova        m0,     [r6 + r4]       ; m0 = coefficient row selected by coeffIdx
+%else
+    mova        m0,     [tab_LumaCoeff + r4]
+%endif
+
+%ifidn %3, pp
+    mova        m1,     [pd_32]         ; pp: constant added to every sum before the >> 6
+    pxor        m7,     m7              ; m7 = 0, passed to CLIPW as the lower bound
+%else
+    mova        m1,     [pd_n32768]     ; ps: constant added to every sum before the >> 2
+%endif
+
+    mov         r4d,    %2              ; r4d = number of rows to filter
+%ifidn %3, ps
+    cmp         r5m,    byte 0          ; isRowExt == 0 ?
+    je          .loopH                  ; yes: filter exactly %2 rows
+    lea         r6,     [r1 + 2 * r1]   ; r6 = 3 * srcStride in bytes
+    sub         r0,     r6              ; start three rows above the block
+    add         r4d,    7               ; and filter %2 + 7 rows
+%endif
+
+.loopH:
+%assign x 0
+%rep %1/8
+    FILTER_LUMA_HOR_8_sse2 x
+
+%ifidn %3, pp
+    psrad       m4,     6
+    psrad       m5,     6
+    packssdw    m4,     m5              ; 8 dwords -> 8 words with signed saturation
+    CLIPW       m4,     m7,     [pw_pixel_max] ; CLIPW is defined elsewhere; presumably clamps words to [m7, pw_pixel_max]
+%else
+    psrad       m4,     2
+    psrad       m5,     2
+    packssdw    m4,     m5              ; 8 dwords -> 8 words with signed saturation
+%endif
+
+    movu        [r2 + x], m4            ; store 8 output words
+%assign x x+16
+%endrep
+
+%rep (%1 % 8)/4
+    FILTER_LUMA_HOR_4_sse2 x
+
+%ifidn %3, pp
+    psrad       m4,     6
+    packssdw    m4,     m4              ; only the low 4 words are used
+    CLIPW       m4,     m7,     [pw_pixel_max]
+%else
+    psrad       m4,     2
+    packssdw    m4,     m4              ; only the low 4 words are used
+%endif
+
+    movh        [r2 + x], m4            ; store the low 4 output words
+%endrep
+
+    add         r0,     r1              ; advance src by one row
+    add         r2,     r3              ; advance dst by one row
+
+    dec         r4d
+    jnz         .loopH
+    RET
+
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+    FILTER_HOR_LUMA_sse2 4, 4, pp       ; macro arguments: width, height, variant
+    FILTER_HOR_LUMA_sse2 4, 8, pp
+    FILTER_HOR_LUMA_sse2 4, 16, pp
+    FILTER_HOR_LUMA_sse2 8, 4, pp
+    FILTER_HOR_LUMA_sse2 8, 8, pp
+    FILTER_HOR_LUMA_sse2 8, 16, pp
+    FILTER_HOR_LUMA_sse2 8, 32, pp
+    FILTER_HOR_LUMA_sse2 12, 16, pp
+    FILTER_HOR_LUMA_sse2 16, 4, pp
+    FILTER_HOR_LUMA_sse2 16, 8, pp
+    FILTER_HOR_LUMA_sse2 16, 12, pp
+    FILTER_HOR_LUMA_sse2 16, 16, pp
+    FILTER_HOR_LUMA_sse2 16, 32, pp
+    FILTER_HOR_LUMA_sse2 16, 64, pp
+    FILTER_HOR_LUMA_sse2 24, 32, pp
+    FILTER_HOR_LUMA_sse2 32, 8, pp
+    FILTER_HOR_LUMA_sse2 32, 16, pp
+    FILTER_HOR_LUMA_sse2 32, 24, pp
+    FILTER_HOR_LUMA_sse2 32, 32, pp
+    FILTER_HOR_LUMA_sse2 32, 64, pp
+    FILTER_HOR_LUMA_sse2 48, 64, pp
+    FILTER_HOR_LUMA_sse2 64, 16, pp
+    FILTER_HOR_LUMA_sse2 64, 32, pp
+    FILTER_HOR_LUMA_sse2 64, 48, pp
+    FILTER_HOR_LUMA_sse2 64, 64, pp
+
+;---------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;---------------------------------------------------------------------------------------------------------------------------
+    FILTER_HOR_LUMA_sse2 4, 4, ps       ; macro arguments: width, height, variant
+    FILTER_HOR_LUMA_sse2 4, 8, ps
+    FILTER_HOR_LUMA_sse2 4, 16, ps
+    FILTER_HOR_LUMA_sse2 8, 4, ps
+    FILTER_HOR_LUMA_sse2 8, 8, ps
+    FILTER_HOR_LUMA_sse2 8, 16, ps
+    FILTER_HOR_LUMA_sse2 8, 32, ps
+    FILTER_HOR_LUMA_sse2 12, 16, ps
+    FILTER_HOR_LUMA_sse2 16, 4, ps
+    FILTER_HOR_LUMA_sse2 16, 8, ps
+    FILTER_HOR_LUMA_sse2 16, 12, ps
+    FILTER_HOR_LUMA_sse2 16, 16, ps
+    FILTER_HOR_LUMA_sse2 16, 32, ps
+    FILTER_HOR_LUMA_sse2 16, 64, ps
+    FILTER_HOR_LUMA_sse2 24, 32, ps
+    FILTER_HOR_LUMA_sse2 32, 8, ps
+    FILTER_HOR_LUMA_sse2 32, 16, ps
+    FILTER_HOR_LUMA_sse2 32, 24, ps
+    FILTER_HOR_LUMA_sse2 32, 32, ps
+    FILTER_HOR_LUMA_sse2 32, 64, ps
+    FILTER_HOR_LUMA_sse2 48, 64, ps
+    FILTER_HOR_LUMA_sse2 64, 16, ps
+    FILTER_HOR_LUMA_sse2 64, 32, ps
+    FILTER_HOR_LUMA_sse2 64, 48, ps
+    FILTER_HOR_LUMA_sse2 64, 64, ps
+
 ;------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list