[x265-commits] [x265] asm: interp_8tap_horiz pp and ps sse2
David T Yuen
dtyx265 at gmail.com
Sat May 2 05:52:44 CEST 2015
details: http://hg.videolan.org/x265/rev/8dd53df88421
branches:
changeset: 10352:8dd53df88421
user: David T Yuen <dtyx265 at gmail.com>
date: Thu Apr 30 05:59:45 2015 -0700
description:
asm: interp_8tap_horiz pp and ps sse2
This replaces C code and covers
4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 8x32, 12x16, 16x4, 16x8, 16x12, 16x16, 16x32, 16x64,
24x32, 32x8, 32x16, 32x24, 32x32, 32x64, 48x64, 64x16, 64x32, 64x48, 64x64
64-bit
./test/TestBench --testbench interp | grep luma_h
luma_hpp[ 4x4] 1.93x 1785.29 3440.28
luma_hps[ 4x4] 1.85x 4487.96 8320.21
luma_hpp[ 8x8] 2.22x 6005.12 13357.72
luma_hps[ 8x8] 1.93x 10942.82 21135.69
luma_hpp[16x16] 2.55x 23903.86 61070.57
luma_hps[16x16] 2.30x 32845.00 75494.60
luma_hpp[32x32] 2.55x 94419.45 241073.86
luma_hps[32x32] 2.21x 110782.46 244683.78
luma_hpp[64x64] 2.53x 376337.31 951570.12
luma_hps[64x64] 2.30x 402073.75 924435.56
luma_hpp[ 8x4] 2.37x 3028.20 7175.86
luma_hps[ 8x4] 1.92x 8122.82 15575.28
luma_hpp[ 4x8] 1.94x 3562.81 6902.89
luma_hps[ 4x8] 1.85x 6087.62 11235.28
luma_hpp[ 16x8] 2.57x 11910.06 30555.35
luma_hps[ 16x8] 2.27x 21564.17 48875.00
luma_hpp[ 8x16] 2.26x 12018.65 27189.64
luma_hps[ 8x16] 1.96x 16582.61 32435.16
luma_hpp[32x16] 2.55x 47313.87 120654.91
luma_hps[32x16] 2.21x 65361.34 144196.59
luma_hpp[16x32] 2.55x 47465.93 121177.02
luma_hps[16x32] 2.28x 55633.76 126907.55
luma_hpp[64x32] 2.53x 187898.22 475918.91
luma_hps[64x32] 2.30x 221152.92 508105.84
luma_hpp[32x64] 2.55x 188351.70 481050.72
luma_hps[32x64] 2.21x 201352.56 445565.69
luma_hpp[16x12] 2.54x 18025.36 45705.77
luma_hps[16x12] 2.27x 27205.93 61835.00
luma_hpp[12x16] 2.35x 18920.75 44486.00
luma_hps[12x16] 1.88x 25562.62 48125.28
luma_hpp[ 16x4] 2.55x 5973.02 15213.14
luma_hps[ 16x4] 2.27x 15877.92 36108.43
luma_hpp[ 4x16] 2.07x 7217.57 14942.64
luma_hps[ 4x16] 1.87x 9127.50 17075.56
luma_hpp[32x24] 2.54x 70851.95 179641.31
luma_hps[32x24] 2.21x 88052.20 194443.84
luma_hpp[24x32] 2.56x 70742.67 181290.56
luma_hps[24x32] 2.23x 83209.55 185356.34
luma_hpp[ 32x8] 2.54x 23639.81 60057.16
luma_hps[ 32x8] 2.20x 42754.99 94160.41
luma_hpp[ 8x32] 2.44x 23742.78 57819.51
luma_hps[ 8x32] 1.96x 27986.91 54777.17
luma_hpp[64x48] 2.53x 281572.38 712128.56
luma_hps[64x48] 2.29x 312399.41 715907.50
luma_hpp[48x64] 2.53x 281742.69 712628.06
luma_hps[48x64] 2.19x 301655.44 661646.25
luma_hpp[64x16] 2.52x 94103.24 237202.28
luma_hps[64x16] 2.30x 130542.85 299616.22
luma_hpp[16x64] 2.57x 94735.02 243127.64
luma_hps[16x64] 2.29x 100840.45 230957.56
Subject: [x265] asm: interp_8tap_hv_pp_8x8 sse3
details: http://hg.videolan.org/x265/rev/57f8246c759d
branches:
changeset: 10353:57f8246c759d
user: David T Yuen <dtyx265 at gmail.com>
date: Wed Apr 29 19:40:08 2015 -0700
description:
asm: interp_8tap_hv_pp_8x8 sse3
This replaces C code
64-bit
./test/TestBench --testbench interp | grep hv
luma_hv [ 8x8] 2.53x 14225.03 35970.65
32-bit
./test/TestBench --testbench interp | grep hv
luma_hv [ 8x8] 2.50x 14367.40 35917.48
diffstat:
source/common/x86/asm-primitives.cpp | 6 +
source/common/x86/ipfilter8.asm | 304 +++++++++++++++++++++++++++++++++++
source/common/x86/ipfilter8.h | 51 +++++
3 files changed, 361 insertions(+), 0 deletions(-)
diffs (truncated from 405 to 300 lines):
diff -r 94e9c3464c49 -r 57f8246c759d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri May 01 14:56:01 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 29 19:40:08 2015 -0700
@@ -1343,6 +1343,12 @@ void setupAssemblyPrimitives(EncoderPrim
CHROMA_422_VSP_FILTERS(_sse2);
CHROMA_444_VSP_FILTERS(_sse2);
+ ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
+ p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
+ ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
+ p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+ p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
+
//p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
p.frameInitLowres = x265_frame_init_lowres_core_sse2;
diff -r 94e9c3464c49 -r 57f8246c759d source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri May 01 14:56:01 2015 -0500
+++ b/source/common/x86/ipfilter8.asm Wed Apr 29 19:40:08 2015 -0700
@@ -151,6 +151,11 @@ const tab_LumaCoeff, db 0, 0, 0, 6
db -1, 4, -11, 40, 40, -11, 4, -1
db 0, 1, -5, 17, 58, -10, 4, -1
+const tabw_LumaCoeff, dw 0, 0, 0, 64, 0, 0, 0, 0 ; 8-tap luma coefficients stored as words, 8 taps (16 bytes) per row; row 0 has a single tap of 64
+ dw -1, 4, -10, 58, 17, -5, 1, 0 ; row 1
+ dw -1, 4, -11, 40, 40, -11, 4, -1 ; row 2
+ dw 0, 1, -5, 17, 58, -10, 4, -1 ; row 3; the code in this patch selects a row with [base + (2 * coeffIdx) * 8]
+
const tab_LumaCoeffV, times 4 dw 0, 0
times 4 dw 0, 64
times 4 dw 0, 0
@@ -807,6 +812,233 @@ cglobal interp_4tap_horiz_pp_%1x%2, 4, 6
IPFILTER_CHROMA_W_sse3 48, 64
IPFILTER_CHROMA_W_sse3 64, 16
+%macro FILTER_H8_W8_sse2 0 ; 8-tap horizontal filter for 8 adjacent outputs. In: r0 = source row, x = assemble-time column offset, m3 = 8 word taps, m6 = zero (callers in this patch clear it with pxor). Out: m1 = 8 word sums. Clobbers m0, m4, m5, m7.
+ movh m1, [r0 + x - 3] ; 8 source bytes for output 0 (window x-3 .. x+4)
+ movh m4, [r0 + x - 2] ; window for output 1
+ punpcklbw m1, m6 ; interleave with m6 to widen bytes to words
+ punpcklbw m4, m6
+ movh m5, [r0 + x - 1] ; window for output 2
+ movh m0, [r0 + x] ; window for output 3
+ punpcklbw m5, m6
+ punpcklbw m0, m6
+ pmaddwd m1, m3 ; multiply by the taps and add adjacent products: 4 dword partial sums per output
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ pmaddwd m0, m3
+ packssdw m1, m4 ; back to words: 4 partial sums of output 0, then 4 of output 1
+ packssdw m5, m0 ; same for outputs 2 and 3
+ pshuflw m4, m1, q2301 ; copy with neighbouring words swapped (low half)
+ pshufhw m4, m4, q2301 ; ... and high half
+ pshuflw m0, m5, q2301
+ pshufhw m0, m0, q2301
+ paddw m1, m4 ; every word pair now holds the sum of that pair, duplicated
+ paddw m5, m0
+ psrldq m1, 2 ; shift down one word so dwords 0 and 2 each hold two distinct pair sums
+ psrldq m5, 2
+ pshufd m1, m1, q3120 ; gather dwords 0 and 2 into the low qword
+ pshufd m5, m5, q3120
+ punpcklqdq m1, m5 ; m1 = two partial sums for each of outputs 0..3
+ movh m7, [r0 + x + 1] ; second half: identical steps for outputs 4..7, accumulated in m7
+ movh m4, [r0 + x + 2]
+ punpcklbw m7, m6
+ punpcklbw m4, m6
+ movh m5, [r0 + x + 3]
+ movh m0, [r0 + x + 4]
+ punpcklbw m5, m6
+ punpcklbw m0, m6
+ pmaddwd m7, m3
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ pmaddwd m0, m3
+ packssdw m7, m4
+ packssdw m5, m0
+ pshuflw m4, m7, q2301
+ pshufhw m4, m4, q2301
+ pshuflw m0, m5, q2301
+ pshufhw m0, m0, q2301
+ paddw m7, m4
+ paddw m5, m0
+ psrldq m7, 2
+ psrldq m5, 2
+ pshufd m7, m7, q3120
+ pshufd m5, m5, q3120
+ punpcklqdq m7, m5 ; m7 = two partial sums for each of outputs 4..7
+ pshuflw m4, m1, q2301 ; final reduction: same swap / add / shift / gather, two partial sums -> one sum per output
+ pshufhw m4, m4, q2301
+ pshuflw m0, m7, q2301
+ pshufhw m0, m0, q2301
+ paddw m1, m4
+ paddw m7, m0
+ psrldq m1, 2
+ psrldq m7, 2
+ pshufd m1, m1, q3120 ; low qword of m1 = sums for outputs 0..3
+ pshufd m7, m7, q3120 ; low qword of m7 = sums for outputs 4..7
+ punpcklqdq m1, m7 ; m1 = sums for outputs 0..7, before any rounding or offset
+%endmacro
+
+%macro FILTER_H8_W4_sse2 0 ; 8-tap horizontal filter for 4 adjacent outputs. In: r0 = source row, x = assemble-time column offset, m3 = 8 word taps, m6 = zero (callers in this patch clear it with pxor). Out: low 4 words of m1 = sums. Clobbers m0, m4, m5.
+ movh m1, [r0 + x - 3] ; 8 source bytes for output 0 (window x-3 .. x+4)
+ movh m0, [r0 + x - 2] ; window for output 1
+ punpcklbw m1, m6 ; interleave with m6 to widen bytes to words
+ punpcklbw m0, m6
+ movh m4, [r0 + x - 1] ; window for output 2
+ movh m5, [r0 + x] ; window for output 3
+ punpcklbw m4, m6
+ punpcklbw m5, m6
+ pmaddwd m1, m3 ; multiply by the taps and add adjacent products: 4 dword partial sums per output
+ pmaddwd m0, m3
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ packssdw m1, m0 ; back to words: 4 partial sums of output 0, then 4 of output 1
+ packssdw m4, m5 ; same for outputs 2 and 3
+ pshuflw m0, m1, q2301 ; copy with neighbouring words swapped (low half)
+ pshufhw m0, m0, q2301 ; ... and high half
+ pshuflw m5, m4, q2301
+ pshufhw m5, m5, q2301
+ paddw m1, m0 ; every word pair now holds the sum of that pair, duplicated
+ paddw m4, m5
+ psrldq m1, 2 ; shift down one word so dwords 0 and 2 each hold two distinct pair sums
+ psrldq m4, 2
+ pshufd m1, m1, q3120 ; gather dwords 0 and 2 into the low qword
+ pshufd m4, m4, q3120
+ punpcklqdq m1, m4 ; m1 = two partial sums for each of outputs 0..3
+ pshuflw m0, m1, q2301 ; final reduction: two partial sums -> one sum per output
+ pshufhw m0, m0, q2301
+ paddw m1, m0
+ psrldq m1, 2
+ pshufd m1, m1, q3120 ; low qword of m1 = sums for outputs 0..3; the upper words are leftovers of the shuffle
+%endmacro
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA_sse2 3 ; arg 1 = block width, arg 2 = block height, arg 3 = pp (byte output) or ps (word output)
+INIT_XMM sse2
+cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
+ mov r4d, r4m ; r4d = coeffIdx
+ add r4d, r4d ; doubled so that r4 * 8 below steps one 16-byte coefficient row per index
+ pxor m6, m6 ; zero register the FILTER_H8_* macros interleave with to widen bytes to words
+
+%ifidn %3, ps
+ add r3d, r3d ; destination stride doubled: ps stores words (see the [r2 + 2 * x] stores)
+ cmp r5m, byte 0 ; isRowExt == 0? The flags are consumed by the je further down
+%endif
+
+%ifdef PIC
+ lea r5, [tabw_LumaCoeff]
+ movu m3, [r5 + r4 * 8] ; m3 = the eight word taps selected by coeffIdx
+%else
+ movu m3, [tabw_LumaCoeff + r4 * 8] ; m3 = the eight word taps selected by coeffIdx
+%endif
+
+ mov r4d, %2 ; r4d = number of rows to produce
+
+%ifidn %3, pp
+ mova m2, [pw_32] ; pp: constant added to each sum before the shift right by 6
+%else
+ mova m2, [pw_2000] ; ps: constant subtracted from every sum
+ je .loopH ; uses the cmp above; the lea/mov/movu/mova in between leave the flags untouched
+ lea r5, [r1 + 2 * r1] ; r5 = 3 * srcStride
+ sub r0, r5 ; isRowExt != 0: start three rows above src
+ add r4d, 7 ; ... and produce seven additional rows
+%endif
+
+.loopH:
+%assign x 0
+%rep %1 / 8
+ FILTER_H8_W8_sse2
+ %ifidn %3, pp
+ paddw m1, m2 ; add the pw_32 constant
+ psraw m1, 6
+ packuswb m1, m1 ; saturate the words to unsigned bytes
+ movh [r2 + x], m1 ; store 8 output bytes
+ %else
+ psubw m1, m2 ; subtract the pw_2000 constant
+ movu [r2 + 2 * x], m1 ; store 8 output words
+ %endif
+%assign x x+8
+%endrep
+
+%rep (%1 % 8) / 4
+ FILTER_H8_W4_sse2
+ %ifidn %3, pp
+ paddw m1, m2 ; add the pw_32 constant
+ psraw m1, 6
+ packuswb m1, m1 ; saturate the words to unsigned bytes
+ movd [r2 + x], m1 ; store the trailing 4 output bytes; x is the offset left by the 8-wide loop
+ %else
+ psubw m1, m2 ; subtract the pw_2000 constant
+ movh [r2 + 2 * x], m1 ; store the trailing 4 output words; x is the offset left by the 8-wide loop
+ %endif
+%endrep
+
+ add r0, r1 ; advance to the next source row
+ add r2, r3 ; advance to the next destination row
+
+ dec r4d
+ jnz .loopH
+ RET
+
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA_sse2 4, 4, pp ; arguments: width, height, variant; each line emits interp_8tap_horiz_pp_<width>x<height>
+ IPFILTER_LUMA_sse2 4, 8, pp
+ IPFILTER_LUMA_sse2 8, 4, pp
+ IPFILTER_LUMA_sse2 8, 8, pp
+ IPFILTER_LUMA_sse2 16, 16, pp
+ IPFILTER_LUMA_sse2 16, 8, pp
+ IPFILTER_LUMA_sse2 8, 16, pp
+ IPFILTER_LUMA_sse2 16, 12, pp
+ IPFILTER_LUMA_sse2 12, 16, pp
+ IPFILTER_LUMA_sse2 16, 4, pp
+ IPFILTER_LUMA_sse2 4, 16, pp
+ IPFILTER_LUMA_sse2 32, 32, pp
+ IPFILTER_LUMA_sse2 32, 16, pp
+ IPFILTER_LUMA_sse2 16, 32, pp
+ IPFILTER_LUMA_sse2 32, 24, pp
+ IPFILTER_LUMA_sse2 24, 32, pp
+ IPFILTER_LUMA_sse2 32, 8, pp
+ IPFILTER_LUMA_sse2 8, 32, pp
+ IPFILTER_LUMA_sse2 64, 64, pp
+ IPFILTER_LUMA_sse2 64, 32, pp
+ IPFILTER_LUMA_sse2 32, 64, pp
+ IPFILTER_LUMA_sse2 64, 48, pp
+ IPFILTER_LUMA_sse2 48, 64, pp
+ IPFILTER_LUMA_sse2 64, 16, pp
+ IPFILTER_LUMA_sse2 16, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA_sse2 4, 4, ps ; arguments: width, height, variant; each line emits interp_8tap_horiz_ps_<width>x<height>
+ IPFILTER_LUMA_sse2 8, 8, ps
+ IPFILTER_LUMA_sse2 8, 4, ps
+ IPFILTER_LUMA_sse2 4, 8, ps
+ IPFILTER_LUMA_sse2 16, 16, ps
+ IPFILTER_LUMA_sse2 16, 8, ps
+ IPFILTER_LUMA_sse2 8, 16, ps
+ IPFILTER_LUMA_sse2 16, 12, ps
+ IPFILTER_LUMA_sse2 12, 16, ps
+ IPFILTER_LUMA_sse2 16, 4, ps
+ IPFILTER_LUMA_sse2 4, 16, ps
+ IPFILTER_LUMA_sse2 32, 32, ps
+ IPFILTER_LUMA_sse2 32, 16, ps
+ IPFILTER_LUMA_sse2 16, 32, ps
+ IPFILTER_LUMA_sse2 32, 24, ps
+ IPFILTER_LUMA_sse2 24, 32, ps
+ IPFILTER_LUMA_sse2 32, 8, ps
+ IPFILTER_LUMA_sse2 8, 32, ps
+ IPFILTER_LUMA_sse2 64, 64, ps
+ IPFILTER_LUMA_sse2 64, 32, ps
+ IPFILTER_LUMA_sse2 32, 64, ps
+ IPFILTER_LUMA_sse2 64, 48, ps
+ IPFILTER_LUMA_sse2 48, 64, ps
+ IPFILTER_LUMA_sse2 64, 16, ps
+ IPFILTER_LUMA_sse2 16, 64, ps
+
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
@@ -3232,6 +3464,78 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8,
RET
;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM sse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
+ mov r4d, r4m
+ mov r5d, r5m
+ add r4d, r4d
+ pxor m6, m6
+
+%ifdef PIC
+ lea r6, [tabw_LumaCoeff]
+ mova m3, [r6 + r4 * 8]
+%else
+ mova m3, [tabw_LumaCoeff + r4 * 8]
+%endif
+
+ ; move to row -3
+ lea r6, [r1 + r1 * 2]
+ sub r0, r6
+
+ mov r4, rsp
+
+%assign x 0 ;needed for FILTER_H8_W8_sse2 macro
+%assign y 1
+%rep 15
+ FILTER_H8_W8_sse2
+ psubw m1, [pw_2000]
+ mova [r4], m1
+
+%if y < 15
+ add r0, r1
More information about the x265-commits
mailing list