[x265] [PATCH] asm: interp_8tap_horiz_pX sse2 10-bit
dtyx265 at gmail.com
dtyx265 at gmail.com
Wed May 27 22:21:59 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1432758084 25200
# Node ID 5e81b9f2acf59e970adccf2c0c2e23bc76406ea1
# Parent 18939c0e321f08207fa0a383939bc44485773013
asm: interp_8tap_horiz_pX sse2 10-bit
This replaces the C code for all interp_8tap_horiz pp and ps primitives in 10-bit builds.
64-bit
./test/TestBench --testbench interp | grep luma_hp;bp
luma_hpp[ 4x4] 1.89x 1810.01 3425.68
luma_hps[ 4x4] 1.69x 5067.49 8543.05
luma_hpp[ 8x8] 1.95x 6772.81 13190.10
luma_hps[ 8x8] 1.81x 12277.58 22193.04
luma_hpp[16x16] 2.34x 26690.99 62385.59
luma_hps[16x16] 2.03x 36679.31 74382.12
luma_hpp[32x32] 2.31x 105869.50 244192.05
luma_hps[32x32] 1.98x 123096.49 243866.27
luma_hpp[64x64] 2.30x 422438.12 972630.69
luma_hps[64x64] 1.96x 446698.34 875284.19
luma_hpp[ 8x4] 1.95x 3393.06 6601.71
luma_hps[ 8x4] 1.80x 9097.55 16352.79
luma_hpp[ 4x8] 1.83x 3715.07 6794.98
luma_hps[ 4x8] 1.80x 6427.61 11542.51
luma_hpp[ 16x8] 2.33x 13336.42 31085.32
luma_hps[ 16x8] 2.03x 24128.17 48955.20
luma_hpp[ 8x16] 1.95x 13555.00 26435.95
luma_hps[ 8x16] 1.83x 18637.67 34194.82
luma_hpp[32x16] 2.30x 53091.36 122332.84
luma_hps[32x16] 1.99x 72525.66 144056.80
luma_hpp[16x32] 2.35x 52985.05 124563.56
luma_hps[16x32] 2.02x 62106.79 125750.16
luma_hpp[64x32] 2.29x 211133.45 484107.31
luma_hps[64x32] 1.96x 245171.02 480323.22
luma_hpp[32x64] 2.32x 211742.39 491633.34
luma_hps[32x64] 1.98x 224430.12 443457.41
luma_hpp[16x12] 2.32x 20139.19 46754.22
luma_hps[16x12] 2.03x 30357.20 61526.66
luma_hpp[12x16] 1.93x 20543.72 39604.34
luma_hps[12x16] 1.80x 28044.56 50457.48
luma_hpp[ 16x4] 2.24x 6683.08 14967.70
luma_hps[ 16x4] 2.01x 17668.30 35595.00
luma_hpp[ 4x16] 1.89x 7095.19 13389.99
luma_hps[ 4x16] 1.73x 10167.50 17543.30
luma_hpp[32x24] 2.30x 79480.97 183180.16
luma_hps[32x24] 1.98x 98020.54 193752.25
luma_hpp[24x32] 2.33x 79450.27 184976.78
luma_hps[24x32] 2.00x 92464.59 185285.14
luma_hpp[ 32x8] 2.31x 26498.50 61131.69
luma_hps[ 32x8] 1.98x 47509.77 93843.30
luma_hpp[ 8x32] 1.93x 27285.71 52692.59
luma_hps[ 8x32] 1.79x 31998.99 57235.87
luma_hpp[64x48] 2.30x 316549.00 729001.88
luma_hps[64x48] 1.96x 346096.66 679268.88
luma_hpp[48x64] 2.31x 317071.12 732797.31
luma_hps[48x64] 1.97x 336062.44 660764.25
luma_hpp[64x16] 2.28x 105878.89 241366.52
luma_hps[64x16] 1.96x 144960.28 283637.09
luma_hpp[16x64] 2.35x 106207.98 250095.09
luma_hps[16x64] 2.03x 113082.70 229174.33
diff -r 18939c0e321f -r 5e81b9f2acf5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 26 13:17:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed May 27 13:21:24 2015 -0700
@@ -857,6 +857,11 @@
CHROMA_422_VERT_FILTERS(_sse2);
CHROMA_444_VERT_FILTERS(sse2);
+ ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
+ p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
+ ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
+ p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
PIXEL_AVG(sse2);
diff -r 18939c0e321f -r 5e81b9f2acf5 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue May 26 13:17:55 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Wed May 27 13:21:24 2015 -0700
@@ -127,6 +127,247 @@
cextern pd_n32768
cextern pw_2000
+%macro FILTER_LUMA_HOR_4_sse2 1 ; %1 = byte offset from r0; out: m4 = 4 dword sums + m1; reads m0, m1; clobbers m2, m3, m5
+ movu m4, [r0 + %1] ; m4 = the 8 words feeding output 0
+ movu m5, [r0 + %1 + 2] ; m5 = the 8 words feeding output 1 (window moved by one word)
+ pmaddwd m4, m0 ; multiply words by m0 and add adjacent pairs -> 4 dword partial sums
+ pmaddwd m5, m0
+ pshufd m2, m4, q2301 ; swap the two dwords inside each 64-bit half
+ paddd m4, m2 ; m4 = [p0+p1, p0+p1, p2+p3, p2+p3]
+ pshufd m2, m5, q2301
+ paddd m5, m2
+ pshufd m4, m4, q3120 ; bring the two distinct sums into the low qword
+ pshufd m5, m5, q3120
+ punpcklqdq m4, m5 ; m4 = two partial sums of output 0, then two of output 1
+ ; outputs 2 and 3: same reduction, using m5 and m3
+ movu m5, [r0 + %1 + 4] ; m5 = the 8 words feeding output 2
+ movu m3, [r0 + %1 + 6] ; m3 = the 8 words feeding output 3
+ pmaddwd m5, m0
+ pmaddwd m3, m0
+ pshufd m2, m5, q2301
+ paddd m5, m2
+ pshufd m2, m3, q2301
+ paddd m3, m2
+ pshufd m5, m5, q3120
+ pshufd m3, m3, q3120
+ punpcklqdq m5, m3 ; m5 = two partial sums of output 2, then two of output 3
+ ; final reduction: add the remaining pair of partial sums for each output
+ pshufd m2, m4, q2301
+ paddd m4, m2 ; m4 = [sum0, sum0, sum1, sum1]
+ pshufd m2, m5, q2301
+ paddd m5, m2 ; m5 = [sum2, sum2, sum3, sum3]
+ pshufd m4, m4, q3120 ; m4 = [sum0, sum1, sum0, sum1]
+ pshufd m5, m5, q3120 ; m5 = [sum2, sum3, sum2, sum3]
+ punpcklqdq m4, m5 ; m4 = [sum0, sum1, sum2, sum3]
+ paddd m4, m1 ; add the per-dword constant held in m1 (set up by the invoking code)
+%endmacro
+
+; FILTER_LUMA_HOR_8_sse2 %1
+; Computes eight consecutive 8-tap sums from the 16-bit samples starting at
+; [r0 + %1] (%1 is a byte offset).
+; In: r0 = source pointer, m0 = eight word coefficients, m1 = dword constant
+; added to every sum.
+; Out: m4 = sums 0-3, m5 = sums 4-7 (dwords, m1 already added).
+; Clobbers: m2, m3, m6.
+%macro FILTER_LUMA_HOR_8_sse2 1
+ FILTER_LUMA_HOR_4_sse2 %1 ; m4 = sums 0-3 + m1; expands to the same instructions previously duplicated here
+
+ movu m5, [r0 + %1 + 8] ; m5 = the 8 words feeding output 4
+ movu m6, [r0 + %1 + 10] ; m6 = the 8 words feeding output 5
+ pmaddwd m5, m0 ; multiply by coefficients, add adjacent pairs -> 4 dwords
+ pmaddwd m6, m0
+ pshufd m2, m5, q2301 ; swap the two dwords inside each 64-bit half
+ paddd m5, m2 ; m5 = [p0+p1, p0+p1, p2+p3, p2+p3]
+ pshufd m2, m6, q2301
+ paddd m6, m2
+ pshufd m5, m5, q3120 ; bring the two distinct sums into the low qword
+ pshufd m6, m6, q3120
+ punpcklqdq m5, m6 ; m5 = partial sums of outputs 4 and 5
+
+ movu m6, [r0 + %1 + 12] ; m6 = the 8 words feeding output 6
+ movu m3, [r0 + %1 + 14] ; m3 = the 8 words feeding output 7
+ pmaddwd m6, m0
+ pmaddwd m3, m0
+ pshufd m2, m6, q2301
+ paddd m6, m2
+ pshufd m2, m3, q2301
+ paddd m3, m2
+ pshufd m6, m6, q3120
+ pshufd m3, m3, q3120
+ punpcklqdq m6, m3 ; m6 = partial sums of outputs 6 and 7
+
+ pshufd m2, m5, q2301 ; final reduction of the remaining partial-sum pairs
+ paddd m5, m2 ; m5 = [sum4, sum4, sum5, sum5]
+ pshufd m2, m6, q2301
+ paddd m6, m2 ; m6 = [sum6, sum6, sum7, sum7]
+ pshufd m5, m5, q3120
+ pshufd m6, m6, q3120
+ punpcklqdq m5, m6 ; m5 = [sum4, sum5, sum6, sum7]
+ paddd m5, m1 ; add the per-dword constant held in m1
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;
+; Macro arguments: %1 = block width, %2 = block height, %3 = pp or ps.
+; Register use inside the row loop:
+; r0 = source row pointer (already moved back by 6 bytes)
+; r1 = source stride in bytes, r3 = destination stride in bytes
+; r2 = destination row pointer, r4d = rows still to process
+; m0 = coefficient row from tab_LumaCoeff, m1 = dword constant added to each sum
+; m7 = zero (pp only, lower clip bound)
+;------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_sse2 3
+INIT_XMM sse2
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+ mov r4d, r4m ; r4d = coeffIdx
+ sub r0, 6 ; start three 16-bit samples to the left of src
+ shl r4d, 4 ; coeffIdx * 16 = byte offset of one 8-word coefficient row
+ add r1, r1 ; strides are intptr_t: double them at full register width
+ add r3, r3 ; so the sign and upper bits survive on x86-64
+
+%ifdef PIC
+ lea r6, [tab_LumaCoeff]
+ mova m0, [r6 + r4]
+%else
+ mova m0, [tab_LumaCoeff + r4]
+%endif
+
+%ifidn %3, pp
+ mova m1, [pd_32] ; constant added before the shift by 6 below
+ pxor m7, m7 ; zero, used as the lower bound by CLIPW
+%else
+ mova m1, [pd_n32768] ; constant added before the shift by 2 below
+%endif
+
+ mov r4d, %2 ; row counter = block height
+%ifidn %3, ps
+ cmp r5m, byte 0 ; isRowExt == 0: process exactly %2 rows
+ je .loopH
+ lea r6, [r1 + 2 * r1] ; r6 = 3 * source stride
+ sub r0, r6 ; start three rows above src
+ add r4d, 7 ; and process seven additional rows
+%endif
+
+.loopH:
+%assign x 0
+; full groups of eight outputs; x is the byte offset within the row
+%rep %1/8
+ FILTER_LUMA_HOR_8_sse2 x
+
+%ifidn %3, pp
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5 ; eight dwords -> eight signed-saturated words
+ CLIPW m4, m7, [pw_pixel_max] ; clip with m7 (zero) and pw_pixel_max as bounds
+%else
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m4, m5
+%endif
+
+ movu [r2 + x], m4
+%assign x x+16
+%endrep
+
+; a trailing group of four outputs (widths 4 and 12 among the instantiated sizes)
+%rep (%1 % 8)/4
+ FILTER_LUMA_HOR_4_sse2 x
+
+%ifidn %3, pp
+ psrad m4, 6
+ packssdw m4, m4
+ CLIPW m4, m7, [pw_pixel_max]
+%else
+ psrad m4, 2
+ packssdw m4, m4
+%endif
+
+ movh [r2 + x], m4 ; store only the low four words
+%endrep
+
+ add r0, r1 ; next source row
+ add r2, r3 ; next destination row
+
+ dec r4d
+ jnz .loopH
+ RET
+
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+ FILTER_HOR_LUMA_sse2 4, 4, pp ; arguments: width, height, variant; one function is emitted per line
+ FILTER_HOR_LUMA_sse2 4, 8, pp
+ FILTER_HOR_LUMA_sse2 4, 16, pp
+ FILTER_HOR_LUMA_sse2 8, 4, pp
+ FILTER_HOR_LUMA_sse2 8, 8, pp
+ FILTER_HOR_LUMA_sse2 8, 16, pp
+ FILTER_HOR_LUMA_sse2 8, 32, pp
+ FILTER_HOR_LUMA_sse2 12, 16, pp
+ FILTER_HOR_LUMA_sse2 16, 4, pp
+ FILTER_HOR_LUMA_sse2 16, 8, pp
+ FILTER_HOR_LUMA_sse2 16, 12, pp
+ FILTER_HOR_LUMA_sse2 16, 16, pp
+ FILTER_HOR_LUMA_sse2 16, 32, pp
+ FILTER_HOR_LUMA_sse2 16, 64, pp
+ FILTER_HOR_LUMA_sse2 24, 32, pp
+ FILTER_HOR_LUMA_sse2 32, 8, pp
+ FILTER_HOR_LUMA_sse2 32, 16, pp
+ FILTER_HOR_LUMA_sse2 32, 24, pp
+ FILTER_HOR_LUMA_sse2 32, 32, pp
+ FILTER_HOR_LUMA_sse2 32, 64, pp
+ FILTER_HOR_LUMA_sse2 48, 64, pp
+ FILTER_HOR_LUMA_sse2 64, 16, pp
+ FILTER_HOR_LUMA_sse2 64, 32, pp
+ FILTER_HOR_LUMA_sse2 64, 48, pp
+ FILTER_HOR_LUMA_sse2 64, 64, pp
+
+;---------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;---------------------------------------------------------------------------------------------------------------------------
+ FILTER_HOR_LUMA_sse2 4, 4, ps ; arguments: width, height, variant; one function is emitted per line
+ FILTER_HOR_LUMA_sse2 4, 8, ps
+ FILTER_HOR_LUMA_sse2 4, 16, ps
+ FILTER_HOR_LUMA_sse2 8, 4, ps
+ FILTER_HOR_LUMA_sse2 8, 8, ps
+ FILTER_HOR_LUMA_sse2 8, 16, ps
+ FILTER_HOR_LUMA_sse2 8, 32, ps
+ FILTER_HOR_LUMA_sse2 12, 16, ps
+ FILTER_HOR_LUMA_sse2 16, 4, ps
+ FILTER_HOR_LUMA_sse2 16, 8, ps
+ FILTER_HOR_LUMA_sse2 16, 12, ps
+ FILTER_HOR_LUMA_sse2 16, 16, ps
+ FILTER_HOR_LUMA_sse2 16, 32, ps
+ FILTER_HOR_LUMA_sse2 16, 64, ps
+ FILTER_HOR_LUMA_sse2 24, 32, ps
+ FILTER_HOR_LUMA_sse2 32, 8, ps
+ FILTER_HOR_LUMA_sse2 32, 16, ps
+ FILTER_HOR_LUMA_sse2 32, 24, ps
+ FILTER_HOR_LUMA_sse2 32, 32, ps
+ FILTER_HOR_LUMA_sse2 32, 64, ps
+ FILTER_HOR_LUMA_sse2 48, 64, ps
+ FILTER_HOR_LUMA_sse2 64, 16, ps
+ FILTER_HOR_LUMA_sse2 64, 32, ps
+ FILTER_HOR_LUMA_sse2 64, 48, ps
+ FILTER_HOR_LUMA_sse2 64, 64, ps
+
;------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list