[x265] [PATCH 1 of 2] asm: routines for luma hps filter functions for all block sizes
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Wed Nov 6 11:31:22 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1383732753 -19800
# Wed Nov 06 15:42:33 2013 +0530
# Node ID 96a46cf4a3b723d58eb8efffbc82acf8055b43f9
# Parent bab35592e71ceac541bba5fa34eac9d657dcd7cf
asm: routines for luma hps filter functions for all block sizes.
diff -r bab35592e71c -r 96a46cf4a3b7 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Nov 06 13:06:15 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Nov 06 15:42:33 2013 +0530
@@ -583,7 +583,7 @@
%endif
%endmacro
-%macro FILTER_H8_W4 3
+%macro FILTER_H8_W4 2
movu %1, [r0 - 3 + r5]
pshufb %2, %1, [tab_Lm]
pmaddubsw %2, m3
@@ -591,92 +591,127 @@
pmaddubsw m7, m3
phaddw %2, m7
phaddw %2, %2
- pmulhrsw %2, %3
- packuswb %2, %2
- movd [r2 + r5], %2
%endmacro
-%macro FILTER_H8_W1 3
- movu %1, [r0 - 3 + r5]
- pshufb %2, %1, [tab_Lm]
- pmaddubsw %2, m3
- phaddw %2, %2
- phaddw %2, %2
- pmulhrsw %2, %3
- packuswb %2, %2
- pextrb [r2 + r5], %2, 0
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro IPFILTER_LUMA 2
-cglobal interp_8tap_horiz_pp_%1x%2, 4, 6, 5
-
-mov r4d, r4m
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA 3 ; %1 = block width, %2 = block height, %3 = pp (byte output) or ps (16-bit output)
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 6, 8
+; 8 XMM registers are declared because FILTER_H8_W4 uses m7; on Win64 xmm6+ are callee-saved and are only preserved when the declared count covers them.
+mov r4d, r4m ; r4 = coeffIdx (5th argument)
%ifdef PIC
-lea r5, [tab_LumaCoeff]
-movh m3, [r5 + r4 * 8]
+lea r5, [tab_LumaCoeff]
+movh m3, [r5 + r4 * 8] ; m3 = the 8 bytes of tab_LumaCoeff selected by coeffIdx
%else
-movh m3, [tab_LumaCoeff + r4 * 8]
+movh m3, [tab_LumaCoeff + r4 * 8] ; same load, absolute addressing
%endif
-punpcklqdq m3, m3
-mova m2, [tab_c_512]
-mov r4, %2
+%ifidn %3, ps
+ add r3d, r3d ; ps stores 16-bit results: double dstStride (presumably passed in elements - confirm against callers)
+%endif
+
+punpcklqdq m3, m3 ; replicate the coefficient qword into both halves of m3
+%ifidn %3, pp
+ mova m2, [tab_c_512] ; pp: multiplier used by pmulhrsw before packing to bytes
+%else
+ mova m2, [tab_c_8192] ; ps: constant subtracted from every 16-bit result
+%endif
+
+mov r4, %2 ; r4 = rows left to process
.loop
- xor r5, r5
+ xor r5, r5 ; r5 = column offset inside the current row
%rep %1 / 8
+ %ifidn %3, pp
FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
- add r5, 8
+ %else
+ FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
+ psubw m1, m2
+ movu [r2 + 2 * r5], m1 ; 16 bytes stored at word offset r5
+ %endif
+ add r5, 8 ; advance to the next group of 8 columns
%endrep
%rep (%1 % 8) / 4
- FILTER_H8_W4 m0, m1, m2
- add r5, 4
+ FILTER_H8_W4 m0, m1 ; leaves the 16-bit sums in m1 (clobbers m7); no r5 advance needed, this is the last group of the row
+ %ifidn %3, pp
+ pmulhrsw m1, m2
+ packuswb m1, m1
+ movd [r2 + r5], m1 ; 4 bytes stored
+ %else
+ psubw m1, m2
+ movh [r2 + 2 * r5], m1 ; 8 bytes (4 words) stored
+ %endif
%endrep
- %rep(%1 % 4)
- FILTER_H8_W1 m0, m1, m2
- add r5, 1
- %endrep
-
- add r0, r1
- add r2, r3
+ add r0, r1 ; src += srcStride
+ add r2, r3 ; dst += dstStride (already doubled for ps)
dec r4d
jnz .loop
RET
%endmacro
- IPFILTER_LUMA 4, 4
- IPFILTER_LUMA 8, 8
- IPFILTER_LUMA 8, 4
- IPFILTER_LUMA 4, 8
- IPFILTER_LUMA 16, 16
- IPFILTER_LUMA 16, 8
- IPFILTER_LUMA 8, 16
- IPFILTER_LUMA 16, 12
- IPFILTER_LUMA 12, 16
- IPFILTER_LUMA 16, 4
- IPFILTER_LUMA 4, 16
- IPFILTER_LUMA 32, 32
- IPFILTER_LUMA 32, 16
- IPFILTER_LUMA 16, 32
- IPFILTER_LUMA 32, 24
- IPFILTER_LUMA 24, 32
- IPFILTER_LUMA 32, 8
- IPFILTER_LUMA 8, 32
- IPFILTER_LUMA 64, 64
- IPFILTER_LUMA 64, 32
- IPFILTER_LUMA 32, 64
- IPFILTER_LUMA 64, 48
- IPFILTER_LUMA 48, 64
- IPFILTER_LUMA 64, 16
- IPFILTER_LUMA 16, 64
-
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -- one instantiation per block size; arguments are width, height, pp
+;-------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA 4, 4, pp
+ IPFILTER_LUMA 8, 8, pp
+ IPFILTER_LUMA 8, 4, pp
+ IPFILTER_LUMA 4, 8, pp
+ IPFILTER_LUMA 16, 16, pp
+ IPFILTER_LUMA 16, 8, pp
+ IPFILTER_LUMA 8, 16, pp
+ IPFILTER_LUMA 16, 12, pp
+ IPFILTER_LUMA 12, 16, pp
+ IPFILTER_LUMA 16, 4, pp
+ IPFILTER_LUMA 4, 16, pp
+ IPFILTER_LUMA 32, 32, pp
+ IPFILTER_LUMA 32, 16, pp
+ IPFILTER_LUMA 16, 32, pp
+ IPFILTER_LUMA 32, 24, pp
+ IPFILTER_LUMA 24, 32, pp
+ IPFILTER_LUMA 32, 8, pp
+ IPFILTER_LUMA 8, 32, pp
+ IPFILTER_LUMA 64, 64, pp
+ IPFILTER_LUMA 64, 32, pp
+ IPFILTER_LUMA 32, 64, pp
+ IPFILTER_LUMA 64, 48, pp
+ IPFILTER_LUMA 48, 64, pp
+ IPFILTER_LUMA 64, 16, pp
+ IPFILTER_LUMA 16, 64, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -- dst receives 16-bit samples (the ps path of IPFILTER_LUMA stores words)
+;-------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA 4, 4, ps
+ IPFILTER_LUMA 8, 8, ps
+ IPFILTER_LUMA 8, 4, ps
+ IPFILTER_LUMA 4, 8, ps
+ IPFILTER_LUMA 16, 16, ps
+ IPFILTER_LUMA 16, 8, ps
+ IPFILTER_LUMA 8, 16, ps
+ IPFILTER_LUMA 16, 12, ps
+ IPFILTER_LUMA 12, 16, ps
+ IPFILTER_LUMA 16, 4, ps
+ IPFILTER_LUMA 4, 16, ps
+ IPFILTER_LUMA 32, 32, ps
+ IPFILTER_LUMA 32, 16, ps
+ IPFILTER_LUMA 16, 32, ps
+ IPFILTER_LUMA 32, 24, ps
+ IPFILTER_LUMA 24, 32, ps
+ IPFILTER_LUMA 32, 8, ps
+ IPFILTER_LUMA 8, 32, ps
+ IPFILTER_LUMA 64, 64, ps
+ IPFILTER_LUMA 64, 32, ps
+ IPFILTER_LUMA 32, 64, ps
+ IPFILTER_LUMA 64, 48, ps
+ IPFILTER_LUMA 48, 64, ps
+ IPFILTER_LUMA 64, 16, ps
+ IPFILTER_LUMA 16, 64, ps
;-----------------------------------------------------------------------------
; Interpolate HV
More information about the x265-devel
mailing list