[x265] [PATCH 1 of 3] asm: routines for luma vsp filter functions for all block sizes
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Mon Nov 11 10:47:08 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1384162289 -19800
# Mon Nov 11 15:01:29 2013 +0530
# Node ID 4a1e725c5743d7c137a6885b9cf9710ffe496030
# Parent 7a4e14ca97b4297005053d8aa8cb4da0177691f6
asm: routines for luma vsp filter functions for all block sizes.
diff -r 7a4e14ca97b4 -r 4a1e725c5743 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Nov 11 11:20:09 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Mon Nov 11 15:01:29 2013 +0530
@@ -2736,6 +2736,167 @@
FILTER_VER_LUMA 64, 48, ps
FILTER_VER_LUMA 64, 64, ps
+%macro PROCESS_LUMA_SP_W4_4R 0 ; 8-tap vertical sums for a 4-column x 4-row tile. In: r0 = top source row, r1 = row step in bytes, r6 = coefficient rows. Out: m0..m3 = Row1..Row4 dword sums. Clobbers m4-m6; leaves r0 advanced by 8*r1.
+ movq m0, [r0] ;row 0: 8 bytes = four 16-bit samples
+ movq m1, [r0 + r1] ;row 1
+ punpcklwd m0, m1 ;m0=[0 1] rows 0 and 1 interleaved word by word
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1, taps from the first 16-byte coefficient row
+
+ movq m4, [r0 + 2 * r1] ;row 2
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+ lea r0, [r0 + 2 * r1] ;r0 -> row 2
+ movq m5, [r0 + r1] ;row 3
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 (m4 itself is still needed for Row1 below)
+ pmaddwd m4, [r6 + 1 * 16] ;rows 2/3 weighted by the second coefficient row
+ paddd m0, m4 ;m0=[0+1+2+3] Row1
+
+ movq m4, [r0 + 2 * r1] ;row 4
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 (m5 itself is still needed for Row2 below)
+ pmaddwd m5, [r6 + 1 * 16] ;rows 3/4 weighted by the second coefficient row
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ lea r0, [r0 + 2 * r1] ;r0 -> row 4
+ movq m5, [r0 + r1] ;row 5
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m6, m4, [r6 + 1 * 16] ;m6 = scratch: rows 4/5 with the second coefficient row
+ paddd m2, m6 ;m2=[2+3+4+5] Row3
+ pmaddwd m4, [r6 + 2 * 16] ;rows 4/5 weighted by the third coefficient row
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
+
+ movq m4, [r0 + 2 * r1] ;row 6
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m6, m5, [r6 + 1 * 16] ;m6 = scratch: rows 5/6 with the second coefficient row
+ paddd m3, m6 ;m3=[3+4+5+6] Row4
+ pmaddwd m5, [r6 + 2 * 16] ;rows 5/6 weighted by the third coefficient row
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
+
+ lea r0, [r0 + 2 * r1] ;r0 -> row 6
+ movq m5, [r0 + r1] ;row 7
+ punpcklwd m4, m5 ;m4=[6 7]
+ pmaddwd m6, m4, [r6 + 2 * 16] ;m6 = scratch: rows 6/7 with the third coefficient row
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
+ pmaddwd m4, [r6 + 3 * 16] ;rows 6/7 weighted by the fourth coefficient row
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
+
+ movq m4, [r0 + 2 * r1] ;row 8
+ punpcklwd m5, m4 ;m5=[7 8]
+ pmaddwd m6, m5, [r6 + 2 * 16] ;m6 = scratch: rows 7/8 with the third coefficient row
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
+ pmaddwd m5, [r6 + 3 * 16] ;rows 7/8 weighted by the fourth coefficient row
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
+
+ lea r0, [r0 + 2 * r1] ;r0 -> row 8 (entry r0 + 8*r1); callers must account for this advance
+ movq m5, [r0 + r1] ;row 9
+ punpcklwd m4, m5 ;m4=[8 9]
+ pmaddwd m4, [r6 + 3 * 16] ;rows 8/9 weighted by the fourth coefficient row
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
+
+ movq m4, [r0 + 2 * r1] ;row 10, the last of the 11 source rows read
+ punpcklwd m5, m4 ;m5=[9 10]
+ pmaddwd m5, [r6 + 3 * 16] ;rows 9/10 weighted by the fourth coefficient row
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SP 2 ; %1 = block width, %2 = block height; both must be multiples of 4 (tiles of 4x4 outputs)
+INIT_XMM ssse3
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-1 ; 5 args, 7 GPRs, 8 XMM regs; the stack slot holds the row-group counter
+
+ add r1, r1 ; srcStride: int16_t units -> bytes; full-width add so an intptr_t stride is not truncated/zero-extended
+ lea r5, [r1 + 2 * r1] ; r5 = 3 source rows in bytes
+ sub r0, r5 ; start reading 3 rows above the first output row
+ shl r4d, 6 ; coeffIdx * 64 = byte offset of its four 16-byte coefficient rows
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV] ; load the table base first, then index it
+ lea r6, [r5 + r4] ; r6 = coefficient rows for this coeffIdx
+%else
+ lea r6, [tab_LumaCoeffV + r4] ; r6 = coefficient rows for this coeffIdx
+%endif
+
+ mova m7, [tab_c_526336] ; constant added to every sum before the >>12 (presumably 526336 per dword, going by the name)
+
+ mov byte [rsp], %2/4 ; outer counter: number of 4-row groups
+.loopH:
+ mov r4d, (%1/4) ; inner counter: number of 4-column groups (r4 is free once r6 is set)
+.loopW:
+ PROCESS_LUMA_SP_W4_4R ; m0..m3 = dword sums for 4 rows x 4 columns; r0 advanced by 8*r1
+
+ paddd m0, m7 ; add the offset constant to each row's sums
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 12 ; arithmetic shift down to the output range
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1 ; m0 = Row1 | Row2 as signed words
+ packssdw m2, m3 ; m2 = Row3 | Row4 as signed words
+
+ packuswb m0, m0 ; saturate to unsigned bytes: dword 0 = Row1, dword 1 = Row2
+ packuswb m2, m2 ; dword 0 = Row3, dword 1 = Row4
+
+ movd [r2], m0 ; store 4 bytes of Row1
+ pshufd m0, m0, 1 ; move dword 1 (Row2) into dword 0
+ movd [r2 + r3], m0 ; store Row2
+ movd [r2 + 2 * r3], m2 ; store Row3
+ pshufd m2, m2, 1 ; move dword 1 (Row4) into dword 0
+ lea r5, [r3 + 2 * r3] ; r5 = 3 * dstStride
+ movd [r2 + r5], m2 ; store Row4
+
+ lea r5, [8 * r1 - 2 * 4] ; the macro moved r0 down 8 rows; rewind that, less 4 samples (8 bytes)
+ sub r0, r5 ; net effect: src += 4 columns
+ add r2, 4 ; dst += 4 bytes (one byte per output column)
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1] ; src: back to column 0, down 4 rows
+ lea r2, [r2 + 4 * r3 - %1] ; dst: back to column 0, down 4 rows
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_SP 4, 4 ; arguments are width, height -> emits interp_8tap_vert_sp_<width>x<height>
+ FILTER_VER_LUMA_SP 8, 8 ; every size below is a multiple of 4 in both dimensions, as the macro's %1/4 and %2/4 loop counts require
+ FILTER_VER_LUMA_SP 8, 4
+ FILTER_VER_LUMA_SP 4, 8
+ FILTER_VER_LUMA_SP 16, 16
+ FILTER_VER_LUMA_SP 16, 8
+ FILTER_VER_LUMA_SP 8, 16
+ FILTER_VER_LUMA_SP 16, 12 ; height 12 -> 3 row groups
+ FILTER_VER_LUMA_SP 12, 16 ; width 12 -> 3 column groups
+ FILTER_VER_LUMA_SP 16, 4
+ FILTER_VER_LUMA_SP 4, 16
+ FILTER_VER_LUMA_SP 32, 32
+ FILTER_VER_LUMA_SP 32, 16
+ FILTER_VER_LUMA_SP 16, 32
+ FILTER_VER_LUMA_SP 32, 24 ; height 24 -> 6 row groups
+ FILTER_VER_LUMA_SP 24, 32 ; width 24 -> 6 column groups
+ FILTER_VER_LUMA_SP 32, 8
+ FILTER_VER_LUMA_SP 8, 32
+ FILTER_VER_LUMA_SP 64, 64 ; largest instance: 16 row groups, still within the byte-sized counter at [rsp]
+ FILTER_VER_LUMA_SP 64, 32
+ FILTER_VER_LUMA_SP 32, 64
+ FILTER_VER_LUMA_SP 64, 48 ; height 48 -> 12 row groups
+ FILTER_VER_LUMA_SP 48, 64 ; width 48 -> 12 column groups
+ FILTER_VER_LUMA_SP 64, 16
+ FILTER_VER_LUMA_SP 16, 64
+
; TODO: combin of U and V is more performance, but need more register
; TODO: use two path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it
INIT_XMM ssse3
More information about the x265-devel
mailing list