[x265] [PATCH] asm: 10bpp code for vertical luma interpolation filters
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Thu Feb 27 12:48:15 CET 2014
# HG changeset patch
# User Nabajit Deka
# Date 1393501657 -19800
# Thu Feb 27 17:17:37 2014 +0530
# Branch stable
# Node ID 452b23cd9f4d7f5785dbebeb02ecd97ad7192a9d
# Parent 0a6dd816d2e2b5135e4c6479b5b734c318daf1aa
asm: 10bpp code for vertical luma interpolation filters.
diff -r 0a6dd816d2e2 -r 452b23cd9f4d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Feb 27 00:43:21 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Thu Feb 27 17:17:37 2014 +0530
@@ -390,7 +390,10 @@
#if HIGH_BIT_DEPTH // temporary, until all 10bit functions are completed
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
- p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu;
+ p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
+ p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
+ p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
+ p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
#else
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
@@ -866,6 +869,7 @@
CHROMA_VERT_FILTERS(_sse2);
p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
+ p.luma_p2s = x265_luma_p2s_sse2;
p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
@@ -885,6 +889,8 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
+
+ LUMA_SS_FILTERS(_sse2);
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 0a6dd816d2e2 -r 452b23cd9f4d source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Thu Feb 27 00:43:21 2014 -0600
+++ b/source/common/x86/ipfilter16.asm Thu Feb 27 17:17:37 2014 +0530
@@ -72,6 +72,26 @@
dw -1, 4, -11, 40, 40, -11, 4, -1
dw 0, 1, -5, 17, 58, -10, 4, -1
+tab_LumaCoeffV: times 4 dw 0, 0 ; coeffIdx 0: taps 0,1 as a word pair repeated 4x (one 16-byte pmaddwd operand)
+ times 4 dw 0, 64 ; coeffIdx 0: taps 2,3
+ times 4 dw 0, 0 ; coeffIdx 0: taps 4,5
+ times 4 dw 0, 0 ; coeffIdx 0: taps 6,7 (each filter occupies 4 * 16 = 64 bytes, indexed by coeffIdx << 6)
+
+ times 4 dw -1, 4 ; coeffIdx 1: taps 0,1
+ times 4 dw -10, 58 ; coeffIdx 1: taps 2,3
+ times 4 dw 17, -5 ; coeffIdx 1: taps 4,5
+ times 4 dw 1, 0 ; coeffIdx 1: taps 6,7
+
+ times 4 dw -1, 4 ; coeffIdx 2: taps 0,1
+ times 4 dw -11, 40 ; coeffIdx 2: taps 2,3
+ times 4 dw 40, -11 ; coeffIdx 2: taps 4,5
+ times 4 dw 4, -1 ; coeffIdx 2: taps 6,7
+
+ times 4 dw 0, 1 ; coeffIdx 3: taps 0,1
+ times 4 dw -5, 17 ; coeffIdx 3: taps 2,3
+ times 4 dw 58, -10 ; coeffIdx 3: taps 4,5
+ times 4 dw 4, -1 ; coeffIdx 3: taps 6,7
+
SECTION .text
cextern pd_32
@@ -2182,3 +2202,496 @@
jnz .loopH
RET
+
+%macro PROCESS_LUMA_VER_W4_4R 0 ; 8-tap vertical filter of a 4-sample-wide column: four output rows as dword sums in m0-m3
+ movq m0, [r0] ; input row 0: four 16-bit samples (r0 = first of 11 input rows, r1 = row pitch in bytes)
+ movq m1, [r0 + r1] ; input row 1
+ punpcklwd m0, m1 ;m0=[0 1] rows 0 and 1 interleaved word-wise
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 (r6 = coefficient pairs, 16 bytes per tap pair)
+
+ lea r0, [r0 + 2 * r1] ; r0 -> input row 2
+ movq m4, [r0]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+ movq m5, [r0 + r1] ; input row 3
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 (3-operand form keeps m4 for the next tap pair)
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3] Row1
+
+ lea r0, [r0 + 2 * r1] ; r0 -> input row 4
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ movq m5, [r0 + r1] ; input row 5
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m6, m4, [r6 + 1 * 16] ; m6 is scratch for the partial product
+ paddd m2, m6 ;m2=[2+3+4+5] Row3
+ pmaddwd m4, [r6 + 2 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
+
+ lea r0, [r0 + 2 * r1] ; r0 -> input row 6
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m6, m5, [r6 + 1 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6] Row4
+ pmaddwd m5, [r6 + 2 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
+
+ movq m5, [r0 + r1] ; input row 7
+ punpcklwd m4, m5 ;m4=[6 7]
+ pmaddwd m6, m4, [r6 + 2 * 16]
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
+
+ lea r0, [r0 + 2 * r1] ; r0 -> input row 8 (r0 is not advanced again in this macro)
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[7 8]
+ pmaddwd m6, m5, [r6 + 2 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
+
+ movq m5, [r0 + r1] ; input row 9
+ punpcklwd m4, m5 ;m4=[8 9]
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
+
+ movq m4, [r0 + 2 * r1] ; input row 10
+ punpcklwd m5, m4 ;m5=[9 10]
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end; exit: r0 = entry r0 + 8 * r1, m4-m6 clobbered
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_PP 2 ; %1 = block width, %2 = block height; the block is processed as 4x4 tiles
+INIT_XMM sse4
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-1
+ ; 8-tap vertical luma filter, pixel in -> pixel out: dst = clip((sum + 32) >> 6, 0, pw_pixel_max)
+ add r1, r1 ; srcStride in 16-bit samples -> bytes; full-width add so an intptr_t stride is not truncated to 32 bits
+ add r3, r3 ; dstStride likewise
+ lea r5, [r1 + 2 * r1]
+ sub r0, r5 ; src -= 3 rows: the filter window starts 3 rows above the output row
+ shl r4d, 6 ; coeffIdx * 64 = byte offset of the filter in tab_LumaCoeffV (32-bit op zero-extends the int argument)
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV]
+ lea r6, [r5 + r4] ; r6 = coefficient pairs of the selected filter
+%else
+ lea r6, [tab_LumaCoeffV + r4] ; r6 = coefficient pairs of the selected filter
+%endif
+
+ mova m7, [pd_32] ; rounding term added before the >> 6
+
+ mov byte [rsp], %2/4 ; number of 4-row groups left; kept in the stack byte requested by the last cglobal argument
+.loopH: ; one group of 4 output rows per iteration
+ mov r4d, (%1/4) ; r4 (coeffIdx no longer needed) = number of 4-wide tiles left in this row group
+.loopW: ; one 4x4 tile per iteration
+ PROCESS_LUMA_VER_W4_4R
+
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1 ; m0 = output rows 0 (low half) and 1 (high half) as words
+ packssdw m2, m3 ; m2 = output rows 2 and 3
+
+ pxor m1, m1 ; m1 = 0: lower clip bound, upper bound is pw_pixel_max
+ CLIPW m0, m1, [pw_pixel_max]
+ CLIPW m2, m1, [pw_pixel_max]
+
+ movh [r2], m0
+ movhps [r2 + r3], m0
+ lea r5, [r2 + 2 * r3]
+ movh [r5], m2
+ movhps [r5 + r3], m2
+
+ lea r5, [8 * r1 - 2 * 4]
+ sub r0, r5 ; undo the 8 rows the tile macro advanced r0 and step 4 samples (8 bytes) right
+ add r2, 2 * 4 ; dst: next 4-wide tile
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1] ; back to column 0, down 4 rows
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_PP 4, 4 ; arguments are width, height; each line defines one interp_8tap_vert_pp_WxH function
+ FILTER_VER_LUMA_PP 8, 8
+ FILTER_VER_LUMA_PP 8, 4
+ FILTER_VER_LUMA_PP 4, 8
+ FILTER_VER_LUMA_PP 16, 16
+ FILTER_VER_LUMA_PP 16, 8
+ FILTER_VER_LUMA_PP 8, 16
+ FILTER_VER_LUMA_PP 16, 12 ; height 12 = three 4-row groups
+ FILTER_VER_LUMA_PP 12, 16 ; width 12 = three 4-wide tiles per row group
+ FILTER_VER_LUMA_PP 16, 4
+ FILTER_VER_LUMA_PP 4, 16
+ FILTER_VER_LUMA_PP 32, 32
+ FILTER_VER_LUMA_PP 32, 16
+ FILTER_VER_LUMA_PP 16, 32
+ FILTER_VER_LUMA_PP 32, 24
+ FILTER_VER_LUMA_PP 24, 32
+ FILTER_VER_LUMA_PP 32, 8
+ FILTER_VER_LUMA_PP 8, 32
+ FILTER_VER_LUMA_PP 64, 64
+ FILTER_VER_LUMA_PP 64, 32
+ FILTER_VER_LUMA_PP 32, 64
+ FILTER_VER_LUMA_PP 64, 48
+ FILTER_VER_LUMA_PP 48, 64
+ FILTER_VER_LUMA_PP 64, 16
+ FILTER_VER_LUMA_PP 16, 64
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_PS 2 ; %1 = block width, %2 = block height; the block is processed as 4x4 tiles
+INIT_XMM sse4
+cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-1
+ ; 8-tap vertical luma filter, pixel in -> int16 out: dst = sat16((sum + pd_n32768) >> 2), no pixel clipping
+ add r1, r1 ; srcStride in 16-bit samples -> bytes; full-width add so an intptr_t stride is not truncated to 32 bits
+ add r3, r3 ; dstStride likewise
+ lea r5, [r1 + 2 * r1]
+ sub r0, r5 ; src -= 3 rows: the filter window starts 3 rows above the output row
+ shl r4d, 6 ; coeffIdx * 64 = byte offset of the filter in tab_LumaCoeffV (32-bit op zero-extends the int argument)
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV]
+ lea r6, [r5 + r4] ; r6 = coefficient pairs of the selected filter
+%else
+ lea r6, [tab_LumaCoeffV + r4] ; r6 = coefficient pairs of the selected filter
+%endif
+
+ mova m7, [pd_n32768] ; offset added before the >> 2 (constant defined elsewhere; by its name, dwords of -32768)
+
+ mov byte [rsp], %2/4 ; number of 4-row groups left; kept in the stack byte requested by the last cglobal argument
+.loopH: ; one group of 4 output rows per iteration
+ mov r4d, (%1/4) ; r4 (coeffIdx no longer needed) = number of 4-wide tiles left in this row group
+.loopW: ; one 4x4 tile per iteration
+ PROCESS_LUMA_VER_W4_4R
+
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 2
+ psrad m1, 2
+ psrad m2, 2
+ psrad m3, 2
+
+ packssdw m0, m1 ; m0 = output rows 0 (low half) and 1 (high half), saturated to int16
+ packssdw m2, m3 ; m2 = output rows 2 and 3
+
+ movh [r2], m0
+ movhps [r2 + r3], m0
+ lea r5, [r2 + 2 * r3]
+ movh [r5], m2
+ movhps [r5 + r3], m2
+
+ lea r5, [8 * r1 - 2 * 4]
+ sub r0, r5 ; undo the 8 rows the tile macro advanced r0 and step 4 samples (8 bytes) right
+ add r2, 2 * 4 ; dst: next 4-wide tile
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1] ; back to column 0, down 4 rows
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_PS 4, 4 ; arguments are width, height; each line defines one interp_8tap_vert_ps_WxH function
+ FILTER_VER_LUMA_PS 8, 8
+ FILTER_VER_LUMA_PS 8, 4
+ FILTER_VER_LUMA_PS 4, 8
+ FILTER_VER_LUMA_PS 16, 16
+ FILTER_VER_LUMA_PS 16, 8
+ FILTER_VER_LUMA_PS 8, 16
+ FILTER_VER_LUMA_PS 16, 12 ; height 12 = three 4-row groups
+ FILTER_VER_LUMA_PS 12, 16 ; width 12 = three 4-wide tiles per row group
+ FILTER_VER_LUMA_PS 16, 4
+ FILTER_VER_LUMA_PS 4, 16
+ FILTER_VER_LUMA_PS 32, 32
+ FILTER_VER_LUMA_PS 32, 16
+ FILTER_VER_LUMA_PS 16, 32
+ FILTER_VER_LUMA_PS 32, 24
+ FILTER_VER_LUMA_PS 24, 32
+ FILTER_VER_LUMA_PS 32, 8
+ FILTER_VER_LUMA_PS 8, 32
+ FILTER_VER_LUMA_PS 64, 64
+ FILTER_VER_LUMA_PS 64, 32
+ FILTER_VER_LUMA_PS 32, 64
+ FILTER_VER_LUMA_PS 64, 48
+ FILTER_VER_LUMA_PS 48, 64
+ FILTER_VER_LUMA_PS 64, 16
+ FILTER_VER_LUMA_PS 16, 64
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SP 2 ; %1 = block width, %2 = block height; the block is processed as 4x4 tiles
+INIT_XMM sse4
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-1
+ ; 8-tap vertical luma filter, int16 in -> pixel out: dst = clip((sum + tab_c_524800) >> 10, 0, pw_pixel_max)
+ add r1, r1 ; srcStride in 16-bit samples -> bytes; full-width add so an intptr_t stride is not truncated to 32 bits
+ add r3, r3 ; dstStride likewise
+ lea r5, [r1 + 2 * r1]
+ sub r0, r5 ; src -= 3 rows: the filter window starts 3 rows above the output row
+ shl r4d, 6 ; coeffIdx * 64 = byte offset of the filter in tab_LumaCoeffV (32-bit op zero-extends the int argument)
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV]
+ lea r6, [r5 + r4] ; r6 = coefficient pairs of the selected filter
+%else
+ lea r6, [tab_LumaCoeffV + r4] ; r6 = coefficient pairs of the selected filter
+%endif
+
+ mova m7, [tab_c_524800] ; offset added before the >> 10 (constant defined elsewhere; by its name, dwords of 524800)
+
+ mov byte [rsp], %2/4 ; number of 4-row groups left; kept in the stack byte requested by the last cglobal argument
+.loopH: ; one group of 4 output rows per iteration
+ mov r4d, (%1/4) ; r4 (coeffIdx no longer needed) = number of 4-wide tiles left in this row group
+.loopW: ; one 4x4 tile per iteration
+ PROCESS_LUMA_VER_W4_4R
+
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 10
+ psrad m1, 10
+ psrad m2, 10
+ psrad m3, 10
+
+ packssdw m0, m1 ; m0 = output rows 0 (low half) and 1 (high half) as words
+ packssdw m2, m3 ; m2 = output rows 2 and 3
+
+ pxor m1, m1 ; m1 = 0: lower clip bound, upper bound is pw_pixel_max
+ CLIPW m0, m1, [pw_pixel_max]
+ CLIPW m2, m1, [pw_pixel_max]
+
+ movh [r2], m0
+ movhps [r2 + r3], m0
+ lea r5, [r2 + 2 * r3]
+ movh [r5], m2
+ movhps [r5 + r3], m2
+
+ lea r5, [8 * r1 - 2 * 4]
+ sub r0, r5 ; undo the 8 rows the tile macro advanced r0 and step 4 samples (8 bytes) right
+ add r2, 2 * 4 ; dst: next 4-wide tile
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1] ; back to column 0, down 4 rows
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_SP 4, 4 ; arguments are width, height; each line defines one interp_8tap_vert_sp_WxH function
+ FILTER_VER_LUMA_SP 8, 8
+ FILTER_VER_LUMA_SP 8, 4
+ FILTER_VER_LUMA_SP 4, 8
+ FILTER_VER_LUMA_SP 16, 16
+ FILTER_VER_LUMA_SP 16, 8
+ FILTER_VER_LUMA_SP 8, 16
+ FILTER_VER_LUMA_SP 16, 12 ; height 12 = three 4-row groups
+ FILTER_VER_LUMA_SP 12, 16 ; width 12 = three 4-wide tiles per row group
+ FILTER_VER_LUMA_SP 16, 4
+ FILTER_VER_LUMA_SP 4, 16
+ FILTER_VER_LUMA_SP 32, 32
+ FILTER_VER_LUMA_SP 32, 16
+ FILTER_VER_LUMA_SP 16, 32
+ FILTER_VER_LUMA_SP 32, 24
+ FILTER_VER_LUMA_SP 24, 32
+ FILTER_VER_LUMA_SP 32, 8
+ FILTER_VER_LUMA_SP 8, 32
+ FILTER_VER_LUMA_SP 64, 64
+ FILTER_VER_LUMA_SP 64, 32
+ FILTER_VER_LUMA_SP 32, 64
+ FILTER_VER_LUMA_SP 64, 48
+ FILTER_VER_LUMA_SP 48, 64
+ FILTER_VER_LUMA_SP 64, 16
+ FILTER_VER_LUMA_SP 16, 64
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SS 2 ; %1 = block width, %2 = block height; the block is processed as 4x4 tiles
+INIT_XMM sse2
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+ ; 8-tap vertical luma filter, int16 in -> int16 out: dst = sat16(sum >> 6), no rounding term and no clipping
+ add r1, r1 ; srcStride in 16-bit samples -> bytes; full-width add so an intptr_t stride is not truncated to 32 bits
+ add r3, r3 ; dstStride likewise
+ lea r5, [3 * r1]
+ sub r0, r5 ; src -= 3 rows: the filter window starts 3 rows above the output row
+ shl r4d, 6 ; coeffIdx * 64 = byte offset of the filter in tab_LumaCoeffV (32-bit op zero-extends the int argument)
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV]
+ lea r6, [r5 + r4] ; r6 = coefficient pairs of the selected filter
+%else
+ lea r6, [tab_LumaCoeffV + r4] ; r6 = coefficient pairs of the selected filter
+%endif
+
+ mov byte [rsp], %2/4 ; number of 4-row groups left; kept in the stack byte requested by the last cglobal argument
+.loopH: ; one group of 4 output rows per iteration
+ mov r4d, (%1/4) ; r4 (coeffIdx no longer needed) = number of 4-wide tiles left in this row group
+.loopW: ; one 4x4 tile per iteration
+ PROCESS_LUMA_VER_W4_4R
+
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1 ; m0 = output rows 0 (low half) and 1 (high half), saturated to int16
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+
+ psrad m2, 6
+ psrad m3, 6
+ packssdw m2, m3 ; m2 = output rows 2 and 3
+ movlps [r2 + 2 * r3], m2
+ lea r5, [3 * r3]
+ movhps [r2 + r5], m2
+
+ lea r5, [8 * r1 - 2 * 4]
+ sub r0, r5 ; undo the 8 rows the tile macro advanced r0 and step 4 samples (8 bytes) right
+ add r2, 2 * 4 ; dst: next 4-wide tile
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1] ; back to column 0, down 4 rows
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+ FILTER_VER_LUMA_SS 4, 4 ; arguments are width, height; each line defines one interp_8tap_vert_ss_WxH function
+ FILTER_VER_LUMA_SS 8, 8
+ FILTER_VER_LUMA_SS 8, 4
+ FILTER_VER_LUMA_SS 4, 8
+ FILTER_VER_LUMA_SS 16, 16
+ FILTER_VER_LUMA_SS 16, 8
+ FILTER_VER_LUMA_SS 8, 16
+ FILTER_VER_LUMA_SS 16, 12 ; height 12 = three 4-row groups
+ FILTER_VER_LUMA_SS 12, 16 ; width 12 = three 4-wide tiles per row group
+ FILTER_VER_LUMA_SS 16, 4
+ FILTER_VER_LUMA_SS 4, 16
+ FILTER_VER_LUMA_SS 32, 32
+ FILTER_VER_LUMA_SS 32, 16
+ FILTER_VER_LUMA_SS 16, 32
+ FILTER_VER_LUMA_SS 32, 24
+ FILTER_VER_LUMA_SS 24, 32
+ FILTER_VER_LUMA_SS 32, 8
+ FILTER_VER_LUMA_SS 8, 32
+ FILTER_VER_LUMA_SS 64, 64
+ FILTER_VER_LUMA_SS 64, 32
+ FILTER_VER_LUMA_SS 32, 64
+ FILTER_VER_LUMA_SS 64, 48
+ FILTER_VER_LUMA_SS 48, 64
+ FILTER_VER_LUMA_SS 64, 16
+ FILTER_VER_LUMA_SS 16, 64
+
+;--------------------------------------------------------------------------------------------------
+; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;--------------------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal luma_p2s, 3, 7, 5
+ ; Converts pixels to 16-bit intermediates: dst = (src << 4) + bias, 4 rows x 8 columns per inner pass.
+ add r1, r1 ; srcStride in 16-bit samples -> bytes
+
+ ; load width and height
+ mov r3d, r3m ; width (cglobal was asked to load only the first 3 arguments)
+ mov r4d, r4m ; height
+
+ ; load constant
+ mova m4, [tab_c_n8192] ; bias added to every output word (defined elsewhere; by its name, words of -8192)
+
+.loopH: ; each pass handles 4 source rows
+
+ xor r5d, r5d ; r5 = column index in samples
+.loopW: ; each pass handles 8 columns of those 4 rows
+ lea r6, [r0 + r5 * 2] ; r6 -> current column in source row 0
+
+ movu m0, [r6] ; row 0, 8 samples
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r6 + r1] ; row 1
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r6 + r1 * 2] ; row 2
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r6, [r6 + r1 * 2]
+ movu m3, [r6 + r1] ; row 3
+ psllw m3, 4
+ paddw m3, m4
+
+ add r5, 8 ; advance the column index before storing; the stores compensate with -16 bytes
+ cmp r5, r3
+ jg .width4 ; overshot the width: fewer than 8 columns were left
+ movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 ; dst row pitch is FENC_STRIDE samples = FENC_STRIDE * 2 bytes
+ movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH ; flags are still those of the cmp above (movu does not change them): r5 == width
+ jmp .loopW
+
+.width4: ; tail: store only the low 4 words (8 bytes) of each row
+ movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+ movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+ movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+ movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+
+.nextH:
+ lea r0, [r0 + r1 * 4] ; src: down 4 rows
+ add r2, FENC_STRIDE * 8 ; dst: down 4 rows (4 * FENC_STRIDE * 2 bytes)
+
+ sub r4d, 4
+ jnz .loopH ; exits only when the counter reaches exactly 0, so height must be a multiple of 4
+
+ RET
diff -r 0a6dd816d2e2 -r 452b23cd9f4d source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Thu Feb 27 00:43:21 2014 -0600
+++ b/source/common/x86/ipfilter8.h Thu Feb 27 17:17:37 2014 +0530
@@ -184,6 +184,7 @@
SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_luma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
CHROMA_VERT_FILTERS(_sse2);
CHROMA_HORIZ_FILTERS(_sse4);
More information about the x265-devel
mailing list