[x265] [PATCH 1 of 2] asm: routines for luma vss filter functions for all block sizes
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Fri Nov 15 13:18:37 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1384517817 -19800
# Fri Nov 15 17:46:57 2013 +0530
# Node ID 351229c80f52d580d24853f64f79e42d47617f87
# Parent b918110fd337178a1cf3616989c65a1e0ed14776
asm: routines for luma vss filter functions for all block sizes.
diff -r b918110fd337 -r 351229c80f52 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Nov 15 17:45:00 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Fri Nov 15 17:46:57 2013 +0530
@@ -5101,3 +5101,147 @@
FILTER_VER_CHROMA_SS_W8_H2 8, 8
FILTER_VER_CHROMA_SS_W8_H2 8, 16
FILTER_VER_CHROMA_SS_W8_H2 8, 32
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SS 2 ; %1 = block width, %2 = block height (the loops below assume both are multiples of 4)
+INIT_XMM sse2
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx; the byte at [rsp] counts remaining 4-row groups
+    add r1, r1 ; strides arrive in int16 samples: double them to bytes (full register width, they are intptr_t)
+    add r3, r3
+    lea r5, [r1 + 2 * r1]
+    sub r0, r5 ; back src up by 3 rows so the 8-row filter window starts at row 0
+    shl r4d, 6 ; coeffIdx * 64 = byte offset of the four 16-byte coefficient rows read below
+
+%ifdef PIC
+    lea r5, [tab_LumaCoeffV]
+    lea r6, [r5 + r4]
+%else
+    lea r6, [tab_LumaCoeffV + r4]
+%endif
+; r6 = coefficient rows for this coeffIdx; [r6 + k * 16] multiplies the interleaved source rows (2k, 2k+1) of a window
+    mov byte [rsp], %2/4
+.loopH:
+    mov r4d, (%1/4) ; r4 is free after the table lookup: reuse it as the count of 4-column tiles per row group
+.loopW:
+    movq m0, [r0] ; each movq fetches 4 int16 samples of one source row
+    movq m1, [r0 + r1]
+    punpcklwd m0, m1 ;m0=[0 1]
+    pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
+
+    movq m4, [r0 + 2 * r1]
+    punpcklwd m1, m4 ;m1=[1 2]
+    pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+    lea r0, [r0 + 2 * r1]
+    movq m5, [r0 + r1]
+    punpcklwd m4, m5 ;m4=[2 3]
+    pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
+    pmaddwd m4, [r6 + 1 * 16]
+    paddd m0, m4 ;m0=[0+1+2+3] Row1
+
+    movq m4, [r0 + 2 * r1]
+    punpcklwd m5, m4 ;m5=[3 4]
+    pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+    pmaddwd m5, [r6 + 1 * 16]
+    paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+    lea r0, [r0 + 2 * r1]
+    movq m5, [r0 + r1]
+    punpcklwd m4, m5 ;m4=[4 5]
+    pmaddwd m6, m4, [r6 + 1 * 16]
+    paddd m2, m6 ;m2=[2+3+4+5] Row3
+    pmaddwd m4, [r6 + 2 * 16]
+    paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
+
+    movq m4, [r0 + 2 * r1]
+    punpcklwd m5, m4 ;m5=[5 6]
+    pmaddwd m6, m5, [r6 + 1 * 16]
+    paddd m3, m6 ;m3=[3+4+5+6] Row4
+    pmaddwd m5, [r6 + 2 * 16]
+    paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
+
+    lea r0, [r0 + 2 * r1]
+    movq m5, [r0 + r1]
+    punpcklwd m4, m5 ;m4=[6 7]
+    pmaddwd m6, m4, [r6 + 2 * 16]
+    paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
+    pmaddwd m4, [r6 + 3 * 16]
+    paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
+    psrad m0, 6
+
+    movq m4, [r0 + 2 * r1]
+    punpcklwd m5, m4 ;m5=[7 8]
+    pmaddwd m6, m5, [r6 + 2 * 16]
+    paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
+    pmaddwd m5, [r6 + 3 * 16]
+    paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
+    psrad m1, 6
+
+    packssdw m0, m1 ; low half = output row 1, high half = output row 2 (saturated to int16)
+
+    movlps [r2], m0
+    movhps [r2 + r3], m0
+
+    lea r0, [r0 + 2 * r1]
+    movq m5, [r0 + r1]
+    punpcklwd m4, m5 ;m4=[8 9]
+    pmaddwd m4, [r6 + 3 * 16]
+    paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
+    psrad m2, 6
+
+    movq m4, [r0 + 2 * r1]
+    punpcklwd m5, m4 ;m5=[9 10]
+    pmaddwd m5, [r6 + 3 * 16]
+    paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
+    psrad m3, 6
+
+    packssdw m2, m3 ; low half = output row 3, high half = output row 4
+
+    movlps [r2 + 2 * r3], m2
+    lea r5, [r3 + 2 * r3]
+    movhps [r2 + r5], m2
+; tile done: rewind src by the 8 rows stepped over above and move src/dst 4 samples (8 bytes) to the right
+    lea r5, [8 * r1 - 2 * 4]
+    sub r0, r5
+    add r2, 2 * 4
+
+    dec r4d
+    jnz .loopW
+; row group done: undo the 2 * %1 byte horizontal advance and move src/dst down by 4 rows
+    lea r0, [r0 + 4 * r1 - 2 * %1]
+    lea r2, [r2 + 4 * r3 - 2 * %1]
+
+    dec byte [rsp]
+    jnz .loopH
+
+    RET
+%endmacro
+
+ FILTER_VER_LUMA_SS 4, 4 ; each line below emits one interp_8tap_vert_ss_<width>x<height> routine (args: width, height)
+ FILTER_VER_LUMA_SS 8, 8
+ FILTER_VER_LUMA_SS 8, 4
+ FILTER_VER_LUMA_SS 4, 8
+ FILTER_VER_LUMA_SS 16, 16
+ FILTER_VER_LUMA_SS 16, 8
+ FILTER_VER_LUMA_SS 8, 16
+ FILTER_VER_LUMA_SS 16, 12
+ FILTER_VER_LUMA_SS 12, 16
+ FILTER_VER_LUMA_SS 16, 4
+ FILTER_VER_LUMA_SS 4, 16
+ FILTER_VER_LUMA_SS 32, 32
+ FILTER_VER_LUMA_SS 32, 16
+ FILTER_VER_LUMA_SS 16, 32
+ FILTER_VER_LUMA_SS 32, 24
+ FILTER_VER_LUMA_SS 24, 32
+ FILTER_VER_LUMA_SS 32, 8
+ FILTER_VER_LUMA_SS 8, 32
+ FILTER_VER_LUMA_SS 64, 64
+ FILTER_VER_LUMA_SS 64, 32
+ FILTER_VER_LUMA_SS 32, 64
+ FILTER_VER_LUMA_SS 64, 48
+ FILTER_VER_LUMA_SS 48, 64
+ FILTER_VER_LUMA_SS 64, 16
+ FILTER_VER_LUMA_SS 16, 64
More information about the x265-devel
mailing list