[x265] [PATCH Review Only] filterHorizontal_p_p_4, 24x24 asm code
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Oct 8 09:15:53 CEST 2013
# HG changeset patch
# User Praveen Tiwari
# Date 1381216535 -19800
# Node ID e5b94aa4444dc927ef38236365cbe3bd757e9eb4
# Parent c58f23e73ffafac74e05696dc68f0b0b21524c05
filterHorizontal_p_p_4, 24x24 asm code
diff -r c58f23e73ffa -r e5b94aa4444d source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Oct 08 12:31:26 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Oct 08 12:45:35 2013 +0530
@@ -373,3 +373,78 @@
FILTER_H4_w16 x0, x1, x2, x3
movu [dstq], x1
RET
+
+ SECTION_RODATA 32 ; x86inc read-only data section; 32 is the alignment argument passed to the macro
+tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ; pshufb mask (loaded as Tm0): 4-byte windows starting at source bytes 0..3
+ db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 ; pshufb mask (loaded as Tm1): 4-byte windows starting at source bytes 4..7
+
+tab_c_512: times 8 dw 512 ; eight words of 512; used as the pmulhrsw multiplier, i.e. (sum + 32) >> 6
+
+SECTION .text
+
+%macro FILTER_H4_w24 4 ; 4-tap filter of one 24-pixel row; %1 = load/scratch, %2 = result, %3 = pmulhrsw multiplier, %4 = scratch
+ movu %1, [srcq - 1] ; pixels 0..7: load 16 source bytes starting one byte left of srcq
+ pshufb %2, %1, Tm0 ; gather the 4-byte windows feeding outputs 0..3
+ pmaddubsw %2, coef2 ; multiply by the byte taps, summing adjacent pairs into words
+ pshufb %1, %1, Tm1 ; gather the 4-byte windows feeding outputs 4..7
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; add the two partial sums of every window -> 8 word results
+ movu %1, [srcq - 1 + 8] ; pixels 8..15: same sequence, accumulated in %4
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3 ; rounded scaling of both halves by the multiplier in %3
+ pmulhrsw %4, %3
+ packuswb %2, %4 ; saturate to unsigned bytes -> 16 output pixels in %2
+ movu [dstq], %2 ; store pixels 0..15 from the result argument, not a fixed register name
+ movu %1, [srcq - 1 + 16] ; pixels 16..23: this load covers source bytes srcq+15 .. srcq+30
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ pmulhrsw %2, %3
+ packuswb %2, %2 ; low 8 bytes of %2 = pixels 16..23; left for the caller to store at dstq + 16
+%endmacro
+
+%macro FILTER_H4_w24_CALL 0 ; filter and store one 24-pixel row, then step both pointers to the next row
+ FILTER_H4_w24 x0, x1, x2, x3 ; writes pixels 0..15 to [dstq]; pixels 16..23 come back in the low half of x1
+
+ movh [dstq + 16], x1 ; store the remaining 8 pixels of the row
+
+ add dstq, dststrideq ; next destination row
+ add srcq, srcstrideq ; next source row
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal filterHorizontal_p_p_4, 4, 5, 7, src, srcstride, dst, dststride ; 7 XMM registers: m0..m6 are all used below
+%define coef2 m6 ; filter taps narrowed to bytes and replicated across the register
+%define Tm0 m5 ; pshufb mask for outputs 0..3 of each 8-pixel group
+%define Tm1 m4 ; pshufb mask for outputs 4..7 of each 8-pixel group
+%define x3 m3 ; scratch accumulator for the second 8-pixel group
+%define x2 m2 ; pmulhrsw multiplier (tab_c_512)
+%define x1 m1 ; result register
+%define x0 m0 ; load/scratch register
+
+ mov r4, r6m ; r4 = coeff (7th argument)
+ movh coef2, [r4] ; only the first four 16-bit taps are consumed, so load 8 bytes rather than 16
+ packsswb coef2, coef2 ; narrow the taps to signed bytes
+ pshufd coef2, coef2, 0 ; replicate the four packed taps into every dword
+
+ mova x2, [tab_c_512]
+
+ mova Tm0, [tab_Tm]
+ mova Tm1, [tab_Tm + 16]
+
+ %rep 23 ; rows 0..22, fully unrolled (24 rows in total; the width/height arguments are never read)
+ FILTER_H4_w24_CALL
+ %endrep
+
+ FILTER_H4_w24 x0, x1, x2, x3 ; last row: same work, but no pointer advance afterwards
+ movh [dstq + 16], x1 ; store pixels 16..23 of the last row
+ RET
More information about the x265-devel
mailing list