[x265] [PATCH Review Only] ASM routine for interp_8tap_vert_pp_8xN function, (N=4, 8, 16, 32)
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Tue Oct 29 14:54:19 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1383054838 -19800
# Tue Oct 29 19:23:58 2013 +0530
# Node ID 2c8344b73d62b97347109946266d7f2e0200daa6
# Parent 358400cb0c67c22a1f387f3af86966447ba884fa
ASM routine for interp_8tap_vert_pp_8xN function, (N=4,8,16,32)
diff -r 358400cb0c67 -r 2c8344b73d62 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Oct 29 11:33:01 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Oct 29 19:23:58 2013 +0530
@@ -73,6 +73,26 @@
times 4 dw 58, -10
times 4 dw 4, -1
+tab_LumaCoeffVL:times 8 db 0, 0 ;coeffIdx 0, taps (c0,c1); each row is one tap pair repeated 8x = 16 bytes
+ times 8 db 0, 64 ;coeffIdx 0, taps (c2,c3)
+ times 8 db 0, 0 ;coeffIdx 0, taps (c4,c5)
+ times 8 db 0, 0 ;coeffIdx 0, taps (c6,c7)
+ ;each coeffIdx group is 4 rows = 64 bytes; the vertical luma code indexes it with coeffIdx << 6
+ times 8 db -1, 4 ;coeffIdx 1, taps (c0,c1)
+ times 8 db -10, 58 ;coeffIdx 1, taps (c2,c3)
+ times 8 db 17, -5 ;coeffIdx 1, taps (c4,c5)
+ times 8 db 1, 0 ;coeffIdx 1, taps (c6,c7)
+ ;pairs are byte-interleaved so pmaddubsw can apply them to two byte-interleaved pixel rows
+ times 8 db -1, 4 ;coeffIdx 2, taps (c0,c1)
+ times 8 db -11, 40 ;coeffIdx 2, taps (c2,c3)
+ times 8 db 40, -11 ;coeffIdx 2, taps (c4,c5)
+ times 8 db 4, -1 ;coeffIdx 2, taps (c6,c7)
+
+ times 8 db 0, 1 ;coeffIdx 3, taps (c0,c1)
+ times 8 db -5, 17 ;coeffIdx 3, taps (c2,c3)
+ times 8 db 58, -10 ;coeffIdx 3, taps (c4,c5)
+ times 8 db 4, -1 ;coeffIdx 3, taps (c6,c7)
+
SECTION .text
%macro FILTER_H4_w2_2 3
@@ -898,3 +918,122 @@
jnz .loopH
RET
+
+;This macro is used by the vertical luma filter to calculate 4 output rows (8 pixels wide) from 11 consecutive source rows.
+%macro FILTER_VL_W8_4R 0 ;in: r0 = first source row, r1 = src stride, r2 = dst, r3 = dst stride, r6 = 4 x 16-byte tap-pair rows; clobbers r5, m0-m2, m4-m7; r0 ends 8 rows further on
+ movq m0, [r0] ;m0 = source row 0 (row numbers are relative to r0 on entry)
+ movq m1, [r0 + r1] ;m1 = source row 1
+ punpcklbw m0, m1 ;byte-interleave rows 0 and 1
+ pmaddubsw m7, m0, [r6] ;m7 = [0+1] -> output row 0, tap pair at [r6]
+
+ movq m0, [r0 + 2 * r1] ;m0 = source row 2
+ punpcklbw m1, m0 ;byte-interleave rows 1 and 2
+ pmaddubsw m6, m1, [r6] ;m6 = [1+2] -> output row 1, tap pair at [r6]
+
+ lea r0, [r0 + 2 * r1] ;r0 -> source row 2
+ movq m1, [r0 + r1] ;m1 = source row 3
+ punpcklbw m0, m1 ;byte-interleave rows 2 and 3
+ pmaddubsw m5, m0, [r6] ;m5 = [2+3] -> output row 2, tap pair at [r6]
+ pmaddubsw m0, [r6 + 16] ;same row pair with the tap pair at [r6 + 16], for output row 0
+ paddw m7, m0 ;m7 = [0+1+2+3]
+
+ movq m0, [r0 + 2 * r1] ;m0 = source row 4
+ punpcklbw m1, m0 ;byte-interleave rows 3 and 4
+ pmaddubsw m4, m1, [r6] ;m4 = [3+4] -> output row 3, tap pair at [r6]
+ pmaddubsw m1, [r6 + 16] ;same row pair with the tap pair at [r6 + 16], for output row 1
+ paddw m6, m1 ;m6 = [1+2+3+4]
+
+ lea r0, [r0 + 2 * r1] ;r0 -> source row 4
+ movq m1, [r0 + r1] ;m1 = source row 5
+ punpcklbw m0, m1 ;byte-interleave rows 4 and 5
+ pmaddubsw m2, m0, [r6 + 16] ;m2 = rows 4,5 with the tap pair at [r6 + 16], for output row 2
+ pmaddubsw m0, [r6 + 32] ;rows 4,5 with the tap pair at [r6 + 32], for output row 0
+ paddw m7, m0 ;m7 = [0+1+2+3+4+5]
+ paddw m5, m2 ;m5 = [2+3+4+5]
+
+ movq m0, [r0 + 2 * r1] ;m0 = source row 6
+ punpcklbw m1, m0 ;byte-interleave rows 5 and 6
+ pmaddubsw m2, m1, [r6 + 16] ;m2 = rows 5,6 with the tap pair at [r6 + 16], for output row 3
+ pmaddubsw m1, [r6 + 32] ;rows 5,6 with the tap pair at [r6 + 32], for output row 1
+ paddw m6, m1 ;m6 = [1+2+3+4+5+6]
+ paddw m4, m2 ;m4 = [3+4+5+6]
+
+ lea r0, [r0 + 2 * r1] ;r0 -> source row 6
+ movq m1, [r0 + r1] ;m1 = source row 7
+ punpcklbw m0, m1 ;byte-interleave rows 6 and 7
+ pmaddubsw m2, m0, [r6 + 32] ;m2 = rows 6,7 with the tap pair at [r6 + 32], for output row 2
+ pmaddubsw m0, [r6 + 48] ;rows 6,7 with the tap pair at [r6 + 48], for output row 0
+ paddw m7, m0 ;m7 = [0+1+2+3+4+5+6+7] (output row 0 complete)
+ paddw m5, m2 ;m5 = [2+3+4+5+6+7]
+
+ movq m0, [r0 + 2 * r1] ;m0 = source row 8
+ punpcklbw m1, m0 ;byte-interleave rows 7 and 8
+ pmaddubsw m2, m1, [r6 + 32] ;m2 = rows 7,8 with the tap pair at [r6 + 32], for output row 3
+ pmaddubsw m1, [r6 + 48] ;rows 7,8 with the tap pair at [r6 + 48], for output row 1
+ paddw m6, m1 ;m6 = [1+2+3+4+5+6+7+8] (output row 1 complete)
+ paddw m4, m2 ;m4 = [3+4+5+6+7+8]
+
+ lea r0, [r0 + 2 * r1] ;r0 -> source row 8
+ movq m1, [r0 + r1] ;m1 = source row 9
+ punpcklbw m0, m1 ;byte-interleave rows 8 and 9
+ pmaddubsw m0, [r6 + 48] ;rows 8,9 with the tap pair at [r6 + 48], for output row 2
+ paddw m5, m0 ;m5 = [2+3+4+5+6+7+8+9] (output row 2 complete)
+
+ movq m0, [r0 + 2 * r1] ;m0 = source row 10
+ punpcklbw m1, m0 ;byte-interleave rows 9 and 10
+ pmaddubsw m1, [r6 + 48] ;rows 9,10 with the tap pair at [r6 + 48], for output row 3
+ paddw m4, m1 ;m4 = [3+4+5+6+7+8+9+10] (output row 3 complete)
+ ;NOTE(review): tab_c_512 is defined outside this patch; presumably 8 words of 512, making pmulhrsw compute (x + 32) >> 6 - confirm
+ pmulhrsw m7, [tab_c_512] ;round and scale output row 0
+ pmulhrsw m6, [tab_c_512] ;round and scale output row 1
+ pmulhrsw m5, [tab_c_512] ;round and scale output row 2
+ pmulhrsw m4, [tab_c_512] ;round and scale output row 3
+
+ packuswb m7, m6 ;saturate to unsigned bytes: low 8 bytes = output row 0, high 8 bytes = output row 1
+ packuswb m5, m4 ;low 8 bytes = output row 2, high 8 bytes = output row 3
+
+ movlps [r2], m7 ;store output row 0
+ movhps [r2 + r3], m7 ;store output row 1
+ movlps [r2 + 2 * r3], m5 ;store output row 2
+ lea r5, [r3 + 2 * r3] ;r5 = 3 * dst stride (overwrites whatever the caller kept in r5)
+ movhps [r2 + r5], m5 ;store output row 3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V_LUMA_8xN 2 ; %1 = block width (8), %2 = block height (multiple of 4)
+INIT_XMM sse4
+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8 ; 8 XMM regs: FILTER_VL_W8_4R writes m7, which x86inc must save on WIN64
+ mov r4d, r4m ; r4 = coeffIdx (5th argument, not auto-loaded)
+ lea r5, [r1 + 2 * r1]
+ sub r0, r5 ; r0 = src - 3 * srcStride, the first row touched by the 8-tap window
+
+ shl r4, 6 ; each coeffIdx owns 64 bytes (4 rows of 16) in tab_LumaCoeffVL
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVL]
+ lea r6, [r5 + r4] ; r6 = coefficient rows for this coeffIdx
+%else
+ lea r6, [tab_LumaCoeffVL + r4] ; r6 = coefficient rows for this coeffIdx
+%endif
+
+ ; r4 is free again once r6 is set up: reuse it as the count of rows left
+ mov r4d, %2
+
+.loopH:
+ FILTER_VL_W8_4R ; writes 4 output rows; leaves r0 advanced by 8 source rows
+
+ lea r5, [4 * r1]
+ sub r0, r5 ; step back 4 rows so the net advance per iteration is 4 source rows
+ lea r5, [4 * r3]
+ add r2, r5 ; dst += 4 * dstStride
+
+ sub r4, 4
+ jnz .loopH
+ RET
+%endmacro
+
+ FILTER_V_LUMA_8xN 8,4 ; instantiates interp_8tap_vert_pp_8x4 (1 pass of the 4-row loop)
+ FILTER_V_LUMA_8xN 8,8 ; instantiates interp_8tap_vert_pp_8x8 (2 passes)
+ FILTER_V_LUMA_8xN 8,16 ; instantiates interp_8tap_vert_pp_8x16 (4 passes)
+ FILTER_V_LUMA_8xN 8,32 ; instantiates interp_8tap_vert_pp_8x32 (8 passes)
More information about the x265-devel
mailing list