[x265] [PATCH 1 of 3] asm: routines for chroma vsp filter functions for all block sizes
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Tue Nov 12 11:58:27 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1384253490 -19800
# Tue Nov 12 16:21:30 2013 +0530
# Node ID da706d553c882eff32b53969a425e69a17976c2e
# Parent fd23a50d6336fc3fef6466c9a8f1baa0e3a2228b
asm: routines for chroma vsp filter functions for all block sizes.
diff -r fd23a50d6336 -r da706d553c88 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Nov 12 16:16:14 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Nov 12 16:21:30 2013 +0530
@@ -53,6 +53,30 @@
db -2, 16, 54, -4
db -2, 10, 58, -2
+tab_ChromaCoeffV: times 4 dw 0, 64
+ times 4 dw 0, 0
+
+ times 4 dw -2, 58
+ times 4 dw 10, -2
+
+ times 4 dw -4, 54
+ times 4 dw 16, -2
+
+ times 4 dw -6, 46
+ times 4 dw 28, -4
+
+ times 4 dw -4, 36
+ times 4 dw 36, -4
+
+ times 4 dw -4, 28
+ times 4 dw 46, -6
+
+ times 4 dw -2, 16
+ times 4 dw 54, -4
+
+ times 4 dw -2, 10
+ times 4 dw 58, -2
+
tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
db -1, 4, -10, 58, 17, -5, 1, 0
db -1, 4, -11, 40, 40, -11, 4, -1
@@ -2973,3 +2997,423 @@
jnz .loopH
RET
+
+%macro PROCESS_CHROMA_SP_W4_4R 0
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+ lea r0, [r0 + 2 * r1]
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3] Row1 done
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ lea r0, [r0 + 2 * r1]
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5] Row3
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6] Row4
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP 2
+INIT_XMM ssse3
+cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-1
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m6, [tab_c_526336]
+
+ mov byte [rsp], %2/4
+
+.loopH
+ mov r4d, (%1/4)
+.loopW
+ PROCESS_CHROMA_SP_W4_4R
+
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m0
+ packuswb m2, m2
+
+ movd [r2], m0
+ pshufd m0, m0, 1
+ movd [r2 + r3], m0
+ movd [r2 + 2 * r3], m2
+ pshufd m2, m2, 1
+ lea r5, [r3 + 2 * r3]
+ movd [r2 + r5], m2
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 4
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1]
+ lea r2, [r2 + 4 * r3 - %1]
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+ FILTER_VER_CHROMA_SP 4, 4
+ FILTER_VER_CHROMA_SP 4, 8
+ FILTER_VER_CHROMA_SP 16, 16
+ FILTER_VER_CHROMA_SP 16, 8
+ FILTER_VER_CHROMA_SP 16, 12
+ FILTER_VER_CHROMA_SP 12, 16
+ FILTER_VER_CHROMA_SP 16, 4
+ FILTER_VER_CHROMA_SP 4, 16
+ FILTER_VER_CHROMA_SP 32, 32
+ FILTER_VER_CHROMA_SP 32, 16
+ FILTER_VER_CHROMA_SP 16, 32
+ FILTER_VER_CHROMA_SP 32, 24
+ FILTER_VER_CHROMA_SP 24, 32
+ FILTER_VER_CHROMA_SP 32, 8
+
+
+%macro PROCESS_CHROMA_SP_W2_4R 0
+ movd m0, [r0]
+ movd m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+
+ movd m2, [r0 + 2 * r1]
+ punpcklwd m1, m2 ;m1=[1 2]
+ punpcklqdq m0, m1 ;m0=[0 1 1 2]
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1 1+2] Row 1-2
+
+ lea r0, [r0 + 2 * r1]
+ movd m1, [r0 + r1]
+ punpcklwd m2, m1 ;m2=[2 3]
+
+ movd m3, [r0 + 2 * r1]
+ punpcklwd m1, m3 ;m2=[3 4]
+ punpcklqdq m2, m1 ;m2=[2 3 3 4]
+
+ pmaddwd m4, m2, [r6 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
+ pmaddwd m2, [r6 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
+ paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
+
+ lea r0, [r0 + 2 * r1]
+ movd m1, [r0 + r1]
+ punpcklwd m3, m1 ;m3=[4 5]
+
+ movd m4, [r0 + 2 * r1]
+ punpcklwd m1, m4 ;m1=[5 6]
+ punpcklqdq m3, m1 ;m2=[4 5 5 6]
+ pmaddwd m3, [r6 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
+ paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W2_4R 2
+INIT_XMM ssse3
+cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 6
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m5, [tab_c_526336]
+
+ mov r4d, (%2/4)
+
+.loopH
+ PROCESS_CHROMA_SP_W2_4R
+
+ paddd m0, m5
+ paddd m2, m5
+
+ psrad m0, 12
+ psrad m2, 12
+
+ packssdw m0, m2
+ packuswb m0, m0
+
+ pextrw [r2], m0, 0
+ pextrw [r2 + r3], m0, 1
+ pextrw [r2 + 2 * r3], m0, 2
+ lea r2, [r2 + 2 * r3]
+ pextrw [r2 + r3], m0, 3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W2_4R 2, 4
+FILTER_VER_CHROMA_SP_W2_4R 2, 8
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m4, [tab_c_526336]
+
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
+
+ movq m2, [r0 + 2 * r1]
+ punpcklwd m1, m2 ;m1=[1 2]
+ pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
+
+ lea r0, [r0 + 2 * r1]
+ movq m3, [r0 + r1]
+ punpcklwd m2, m3 ;m4=[2 3]
+ pmaddwd m2, [r5 + 1 * 16]
+ paddd m0, m2 ;m0=[0+1+2+3] Row1 done
+ paddd m0, m4
+ psrad m0, 12
+
+ movq m2, [r0 + 2 * r1]
+ punpcklwd m3, m2 ;m5=[3 4]
+ pmaddwd m3, [r5 + 1 * 16]
+ paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
+ paddd m1, m4
+ psrad m1, 12
+
+ packssdw m0, m1
+ packuswb m0, m0
+
+ movd [r2], m0
+ pshufd m0, m0, 1
+ movd [r2 + r3], m0
+
+ RET
+
+;-------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vertical_sp_6x8(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_4tap_vert_sp_6x8, 5, 7, 7
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m6, [tab_c_526336]
+
+ mov r4d, 8/4
+
+.loopH
+ PROCESS_CHROMA_SP_W4_4R
+
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m0
+ packuswb m2, m2
+
+ movd [r2], m0
+ pshufd m0, m0, 1
+ movd [r2 + r3], m0
+ movd [r2 + 2 * r3], m2
+ pshufd m2, m2, 1
+ lea r5, [r3 + 2 * r3]
+ movd [r2 + r5], m2
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 4
+
+ PROCESS_CHROMA_SP_W2_4R
+
+ paddd m0, m6
+ paddd m2, m6
+
+ psrad m0, 12
+ psrad m2, 12
+
+ packssdw m0, m2
+ packuswb m0, m0
+
+ pextrw [r2], m0, 0
+ pextrw [r2 + r3], m0, 1
+ pextrw [r2 + 2 * r3], m0, 2
+ lea r2, [r2 + 2 * r3]
+ pextrw [r2 + r3], m0, 3
+
+ sub r0, 2 * 4
+ lea r2, [r2 + 2 * r3 - 4]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+
+%macro PROCESS_CHROMA_SP_W8_2R 0
+ movu m1, [r0]
+ movu m3, [r0 + r1]
+ punpcklwd m0, m1, m3
+ pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
+ punpckhwd m1, m3
+ pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
+
+ movu m4, [r0 + 2 * r1]
+ punpcklwd m2, m3, m4
+ pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
+ punpckhwd m3, m4
+ pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0 + r1]
+ punpcklwd m6, m4, m5
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
+ paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
+ punpckhwd m4, m5
+ pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h
+ paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
+
+ movu m4, [r0 + 2 * r1]
+ punpcklwd m6, m5, m4
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
+ paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
+ punpckhwd m5, m4
+ pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h
+ paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W8_H2 2
+INIT_XMM ssse3
+cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m7, [tab_c_526336]
+
+ mov r4d, %2/2
+.loopH
+ PROCESS_CHROMA_SP_W8_2R
+
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m2
+
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W8_H2 8, 2
+FILTER_VER_CHROMA_SP_W8_H2 8, 4
+FILTER_VER_CHROMA_SP_W8_H2 8, 6
+FILTER_VER_CHROMA_SP_W8_H2 8, 8
+FILTER_VER_CHROMA_SP_W8_H2 8, 16
+FILTER_VER_CHROMA_SP_W8_H2 8, 32
More information about the x265-devel
mailing list