[x265] [PATCH] asm: interp_8tap_vert_pX sse2
dtyx265 at gmail.com
dtyx265 at gmail.com
Fri May 29 19:01:15 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1432917446 25200
# Node ID 2d5efe979f6b9c8db275ecb53767e4bcff1da659
# Parent 12f0ed28ba0eb29f2df0bb8adbc5f3cfb40a6361
asm: interp_8tap_vert_pX sse2
This code replaces the C code for SSE2. It combines the SSE4 macros into one macro
for smaller code size, with no sacrifice in functionality, plus a few tweaks for performance.
The original SSE4 macros only use instructions up to SSE2, so this code may perform better
thanks to the tweaks: the inner loop is unrolled, which eliminates the need to use the stack
to hold the counter for one of the loops, and incrementing the source register is replaced
with address offsets.
diff -r 12f0ed28ba0e -r 2d5efe979f6b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri May 29 10:46:29 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri May 29 09:37:26 2015 -0700
@@ -861,6 +861,8 @@
p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
diff -r 12f0ed28ba0e -r 2d5efe979f6b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Fri May 29 10:46:29 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Fri May 29 09:37:26 2015 -0700
@@ -369,6 +369,201 @@
FILTER_HOR_LUMA_sse2 64, 48, ps
FILTER_HOR_LUMA_sse2 64, 64, ps
+%macro PROCESS_LUMA_VER_W4_4R_sse2 0 ; 8-tap vertical filter core: 4 columns x 4 output rows, dword sums left in m0-m3
+ movq m0, [r0] ; r0 -> input row 0; each movq fetches 8 bytes = 4 words of one row
+ movq m1, [r0 + r1] ; r1 = byte distance between consecutive rows
+ punpcklwd m0, m1 ;m0=[0 1] (words of rows 0 and 1 interleaved)
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 (r6 -> four 16-byte coefficient vectors)
+ ; Notation: [a b] = rows a,b interleaved; [a+b+...] = dword sums over those rows; RowN = Nth output row.
+ lea r0, [r0 + 2 * r1] ; step down two rows
+ movq m4, [r0]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 (legacy-SSE memory operand: address must be 16-byte aligned)
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 (3-operand form: m4 is kept for the next coefficient vector)
+ pmaddwd m4, [r6 + 1 * 16] ; same row pair, second coefficient vector
+ paddd m0, m4 ;m0=[0+1+2+3] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m6, m4, [r6 + 1 * 16] ; m6 is scratch for the partial product
+ paddd m2, m6 ;m2=[2+3+4+5] Row3
+ pmaddwd m4, [r6 + 2 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m6, m5, [r6 + 1 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6] Row4
+ pmaddwd m5, [r6 + 2 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[6 7]
+ pmaddwd m6, m4, [r6 + 2 * 16]
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
+
+ lea r0, [r0 + 2 * r1] ; fourth and last advance: r0 is now 8 rows below its entry value
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[7 8]
+ pmaddwd m6, m5, [r6 + 2 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[8 9]
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
+ ; Row 10 is fetched at [r0 + 2 * r1] without moving r0, so the caller must rewind r0 by 8 rows.
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[9 10]
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end; m4-m6 are left clobbered
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%1_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_sse2 3 ; %1 = pp | ps (output variant), %2 = block width, %3 = block height
+INIT_XMM sse2
+cglobal interp_8tap_vert_%1_%2x%3, 5, 7, 8 ; presumably x86inc's "args, GPRs, vector regs" counts - confirm against x86inc
+ ; Assuming x86inc maps the C arguments in order: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx.
+ add r1d, r1d ; srcStride doubled so it can be used as a byte offset (samples are 16-bit words)
+ add r3d, r3d ; dstStride likewise. NOTE(review): 32-bit adds zero-extend on x86-64; confirm strides are never negative
+ lea r5, [r1 + 2 * r1] ; r5 = 3 source rows in bytes
+ sub r0, r5 ; start 3 rows above src: the core macro reads rows 0..10 to produce 4 output rows
+ shl r4d, 6 ; coeffIdx * 64 = byte offset of its 4 x 16-byte set; also clears the upper half of r4 used below
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV] ; table base loaded on its own; presumably RIP-relative here, which cannot take an index register
+ lea r6, [r5 + r4] ; r6 -> coefficient set selected by coeffIdx
+%else
+ lea r6, [tab_LumaCoeffV + r4] ; r6 -> coefficient set selected by coeffIdx
+%endif
+
+%ifidn %1,pp
+ mova m7, [pd_32] ; term added before the shift; constant defined elsewhere, presumably dwords of 32 = 1 << (SHIFT - 1)
+%define SHIFT 6
+%elifidn %1,ps
+ mova m7, [pd_n32768] ; term added before the shift; constant defined elsewhere, presumably dwords of -32768
+%define SHIFT 2
+%endif
+ ; Outer loop: one band of 4 output rows per pass. The %rep unrolls each band into %2/4 strips of 4 columns.
+ mov r4d, %3/4 ; r4d = bands remaining (integer division: height is expected to be a multiple of 4)
+.loopH:
+%assign x 0
+%rep %2/4
+ PROCESS_LUMA_VER_W4_4R_sse2
+ ; m0..m3 now hold the unscaled dword sums of output rows 1..4 for this strip; x = byte offset of the strip in dst.
+ paddd m0, m7 ; add the rounding/offset term to every sum
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, SHIFT ; arithmetic right shift back to sample scale
+ psrad m1, SHIFT
+ psrad m2, SHIFT
+ psrad m3, SHIFT
+
+ packssdw m0, m1 ; m0 = output rows 1 (low half) and 2 (high half) as signed-saturated words
+ packssdw m2, m3 ; m2 = output rows 3 (low half) and 4 (high half)
+
+%ifidn %1,pp
+ pxor m1, m1 ; m1 = 0, passed as the third CLIPW2 operand
+ CLIPW2 m0, m2, m1, [pw_pixel_max] ; CLIPW2 is defined elsewhere; presumably clamps m0 and m2 to [0, pixel_max]
+%endif
+
+ movh [r2 + x], m0 ; output row 1 <- low 8 bytes
+ movhps [r2 + r3 + x], m0 ; output row 2 <- high 8 bytes
+ lea r5, [r2 + 2 * r3] ; r5 -> output row 3 (r5 is free again once r6 has been set up)
+ movh [r5 + x], m2 ; output row 3
+ movhps [r5 + r3 + x], m2 ; output row 4
+
+ lea r5, [8 * r1 - 2 * 4] ; the core macro left r0 advanced by 8 rows ...
+ sub r0, r5 ; ... rewind them and step 4 samples (8 bytes) right to the next strip
+%assign x x+8
+%endrep
+ ; Next band: source goes down 4 rows and back left by the block width (2 * %2 bytes); dst goes down 4 rows.
+ lea r0, [r0 + 4 * r1 - 2 * %2]
+ lea r2, [r2 + 4 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_sse2 pp, 4, 4 ; macro arguments: variant, width, height -> emits interp_8tap_vert_pp_4x4
+ FILTER_VER_LUMA_sse2 pp, 8, 8
+ FILTER_VER_LUMA_sse2 pp, 8, 4
+ FILTER_VER_LUMA_sse2 pp, 4, 8
+ FILTER_VER_LUMA_sse2 pp, 16, 16
+ FILTER_VER_LUMA_sse2 pp, 16, 8
+ FILTER_VER_LUMA_sse2 pp, 8, 16
+ FILTER_VER_LUMA_sse2 pp, 16, 12 ; height 12 -> 3 passes of the 4-row loop
+ FILTER_VER_LUMA_sse2 pp, 12, 16 ; width 12 -> 3 unrolled 4-column strips
+ FILTER_VER_LUMA_sse2 pp, 16, 4
+ FILTER_VER_LUMA_sse2 pp, 4, 16
+ FILTER_VER_LUMA_sse2 pp, 32, 32
+ FILTER_VER_LUMA_sse2 pp, 32, 16
+ FILTER_VER_LUMA_sse2 pp, 16, 32
+ FILTER_VER_LUMA_sse2 pp, 32, 24
+ FILTER_VER_LUMA_sse2 pp, 24, 32 ; width 24 -> 6 strips
+ FILTER_VER_LUMA_sse2 pp, 32, 8
+ FILTER_VER_LUMA_sse2 pp, 8, 32
+ FILTER_VER_LUMA_sse2 pp, 64, 64 ; width 64 -> 16 strips: the loop body is fully unrolled across the width
+ FILTER_VER_LUMA_sse2 pp, 64, 32
+ FILTER_VER_LUMA_sse2 pp, 32, 64
+ FILTER_VER_LUMA_sse2 pp, 64, 48
+ FILTER_VER_LUMA_sse2 pp, 48, 64 ; width 48 -> 12 strips
+ FILTER_VER_LUMA_sse2 pp, 64, 16
+ FILTER_VER_LUMA_sse2 pp, 16, 64 ; NOTE(review): list presumably has to match the sizes ALL_LUMA_PU(luma_vpp, ...) references - verify
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_sse2 ps, 4, 4 ; macro arguments: variant, width, height -> emits interp_8tap_vert_ps_4x4
+ FILTER_VER_LUMA_sse2 ps, 8, 8
+ FILTER_VER_LUMA_sse2 ps, 8, 4
+ FILTER_VER_LUMA_sse2 ps, 4, 8
+ FILTER_VER_LUMA_sse2 ps, 16, 16
+ FILTER_VER_LUMA_sse2 ps, 16, 8
+ FILTER_VER_LUMA_sse2 ps, 8, 16
+ FILTER_VER_LUMA_sse2 ps, 16, 12 ; height 12 -> 3 passes of the 4-row loop
+ FILTER_VER_LUMA_sse2 ps, 12, 16 ; width 12 -> 3 unrolled 4-column strips
+ FILTER_VER_LUMA_sse2 ps, 16, 4
+ FILTER_VER_LUMA_sse2 ps, 4, 16
+ FILTER_VER_LUMA_sse2 ps, 32, 32
+ FILTER_VER_LUMA_sse2 ps, 32, 16
+ FILTER_VER_LUMA_sse2 ps, 16, 32
+ FILTER_VER_LUMA_sse2 ps, 32, 24
+ FILTER_VER_LUMA_sse2 ps, 24, 32 ; width 24 -> 6 strips
+ FILTER_VER_LUMA_sse2 ps, 32, 8
+ FILTER_VER_LUMA_sse2 ps, 8, 32
+ FILTER_VER_LUMA_sse2 ps, 64, 64 ; width 64 -> 16 strips: the loop body is fully unrolled across the width
+ FILTER_VER_LUMA_sse2 ps, 64, 32
+ FILTER_VER_LUMA_sse2 ps, 32, 64
+ FILTER_VER_LUMA_sse2 ps, 64, 48
+ FILTER_VER_LUMA_sse2 ps, 48, 64 ; width 48 -> 12 strips
+ FILTER_VER_LUMA_sse2 ps, 64, 16
+ FILTER_VER_LUMA_sse2 ps, 16, 64 ; NOTE(review): list presumably has to match the sizes ALL_LUMA_PU(luma_vps, ...) references - verify
+
;------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list