[x265] [PATCH] asm code for luma filter functions
Steve Borho
steve at borho.org
Fri Oct 18 21:51:32 CEST 2013
On Fri, Oct 18, 2013 at 5:48 AM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1382093292 -19800
> # Node ID 0efb3f85325f03edb436b260ba28189b8eae6b3f
> # Parent 975b3d17baa8339f77a3b99245136e8c06cf3fdb
> asm code for luma filter functions
>
I've folded a few patches together, but these changes are all queued.
Thanks for persevering; the new 4-tap and 8-tap routines have good
performance, and I think we have a solid foundation on which to add the rest
of the interpolation primitives and, perhaps, do some XOP/AVX/AVX2
optimizations of the existing macros in the future.
The next step is to wire up these new primitives into the motion
compensation routines. Then on to the vertical interpolation and the ps/sp
varieties.
>
> diff -r 975b3d17baa8 -r 0efb3f85325f source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Fri Oct 18 15:58:29 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Fri Oct 18 16:18:12 2013 +0530
> @@ -279,6 +279,7 @@
> SA8D_INTER_FROM_BLOCK(sse4);
>
> CHROMA_FILTERS(_sse4);
> + LUMA_FILTERS(_sse4);
> }
> if (cpuMask & X265_CPU_AVX)
> {
> diff -r 975b3d17baa8 -r 0efb3f85325f source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Fri Oct 18 15:58:29 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm Fri Oct 18 16:18:12 2013 +0530
> @@ -30,6 +30,11 @@
> tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
> db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
>
> +tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 ; pshufb masks: each 16-byte row gathers two overlapping 8-byte source windows (here for output pixels 0 and 1)
> + db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 ; row at +16: windows for output pixels 2 and 3
> + db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 ; row at +32: windows for output pixels 4 and 5
> + db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14 ; row at +48: windows for output pixels 6 and 7
> +
> tab_c_512: times 8 dw 512
>
> tab_coeff: db 0, 64, 0, 0
> @@ -41,6 +46,12 @@
> db -2, 16, 54, -4
> db -2, 10, 58, -2
>
> +tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0 ; 8 signed byte taps per row, addressed as tab_LumaCoeff + coeffIdx * 8; row 0 passes the sample at tap position 3 through with weight 64
> + db -1, 4, -10, 58, 17, -5, 1, 0 ; coeffIdx 1 (taps sum to 64)
> + db -1, 4, -11, 40, 40, -11, 4, -1 ; coeffIdx 2 (taps sum to 64)
> + db 0, 1, -5, 17, 58, -10, 4, -1 ; coeffIdx 3 (taps sum to 64; mirror image of row 1)
> +
> +
> SECTION .text
>
> %macro FILTER_H4_w2_2 3
> @@ -469,3 +480,116 @@
> IPFILTER_CHROMA_W 32, 16
> IPFILTER_CHROMA_W 32, 24
> IPFILTER_CHROMA_W 32, 32
> +
> +
> +; FILTER_H8_W8 src16, acc, round
> +;   Filters 8 adjacent output pixels at column offset r5 and stores the
> +;   8 result bytes to [r2 + r5].
> +;     %1 = receives the 16 source bytes loaded from [r0 + r5 - 3]; destroyed
> +;     %2 = accumulator / result register; destroyed
> +;     %3 = pmulhrsw multiplier (IPFILTER_LUMA passes eight words of 512,
> +;          which makes the scaling step (sum + 32) >> 6)
> +;   Reads m3 as the signed coefficient operand of pmaddubsw.
> +;   Clobbers m7, which is used as a hard-coded scratch register.
> +%macro FILTER_H8_W8 3
> +    movu        %1, [r0 + r5 - 3]
> +
> +    ; pixels 0-3: gather the 8-byte windows, multiply by the taps, pair-add
> +    pshufb      %2, %1, [tab_Lm + 0 * 16]
> +    pshufb      m7, %1, [tab_Lm + 1 * 16]
> +    pmaddubsw   %2, m3
> +    pmaddubsw   m7, m3
> +    phaddw      %2, m7
> +
> +    ; pixels 4-7: same steps; %1 is no longer needed afterwards, so it is
> +    ; overwritten in place by the last shuffle
> +    pshufb      m7, %1, [tab_Lm + 2 * 16]
> +    pshufb      %1, %1, [tab_Lm + 3 * 16]
> +    pmaddubsw   m7, m3
> +    pmaddubsw   %1, m3
> +    phaddw      m7, %1
> +
> +    phaddw      %2, m7                      ; one 16-bit sum per output pixel
> +    pmulhrsw    %2, %3                      ; rounded scale-down
> +    packuswb    %2, %2                      ; saturate to unsigned bytes
> +    movh        [r2 + r5], %2               ; store 8 pixels
> +%endmacro
> +
> +; FILTER_H8_W4 src16, acc, round
> +;   Filters 4 adjacent output pixels at column offset r5 and stores the
> +;   4 result bytes to [r2 + r5].
> +;     %1 = receives the 16 source bytes loaded from [r0 + r5 - 3]
> +;     %2 = accumulator / result register; destroyed
> +;     %3 = pmulhrsw multiplier
> +;   Reads m3 as the signed coefficient operand of pmaddubsw.
> +;   Clobbers m7, which is used as a hard-coded scratch register.
> +%macro FILTER_H8_W4 3
> +    movu        %1, [r0 + r5 - 3]
> +
> +    pshufb      %2, %1, [tab_Lm + 0 * 16]   ; windows for pixels 0 and 1
> +    pshufb      m7, %1, [tab_Lm + 1 * 16]   ; windows for pixels 2 and 3
> +    pmaddubsw   %2, m3
> +    pmaddubsw   m7, m3
> +
> +    phaddw      %2, m7                      ; two partial sums per pixel
> +    phaddw      %2, %2                      ; one 16-bit sum per pixel
> +    pmulhrsw    %2, %3                      ; rounded scale-down
> +    packuswb    %2, %2                      ; saturate to unsigned bytes
> +    movd        [r2 + r5], %2               ; store 4 pixels
> +%endmacro
> +
> +; FILTER_H8_W1 src16, acc, round
> +;   Filters a single output pixel at column offset r5 and stores the
> +;   result byte to [r2 + r5].
> +;     %1 = receives the 16 source bytes loaded from [r0 + r5 - 3]
> +;     %2 = accumulator / result register; destroyed
> +;     %3 = pmulhrsw multiplier
> +;   Reads m3 as the signed coefficient operand of pmaddubsw.
> +%macro FILTER_H8_W1 3
> +    movu        %1, [r0 + r5 - 3]
> +    pshufb      %2, %1, [tab_Lm + 0 * 16]
> +    pmaddubsw   %2, m3                      ; four pair-products for the pixel in the low qword
> +%rep 2
> +    phaddw      %2, %2                      ; two rounds fold them into word 0
> +%endrep
> +    pmulhrsw    %2, %3                      ; rounded scale-down
> +    packuswb    %2, %2                      ; saturate to unsigned bytes
> +    pextrb      [r2 + r5], %2, 0            ; store only the lowest byte
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> +;-----------------------------------------------------------------------------
> +; IPFILTER_LUMA width, height
> +;   Emits interp_8tap_horiz_pp_<width>x<height>: an 8-tap horizontal filter
> +;   over a width x height block of byte pixels.
> +;     %1 = block width in pixels, %2 = block height in rows
> +;
> +;   Register roles (x86inc names):
> +;     r0 = src          r1 = srcStride
> +;     r2 = dst          r3 = dstStride
> +;     r4 = coeffIdx on entry, then the remaining-row counter
> +;     r5 = table base (PIC builds only), then the column offset in the row
> +;     m0, m1 = scratch passed to the FILTER_H8_* helpers
> +;     m2 = eight words of 512 (pmulhrsw multiplier, i.e. (sum + 32) >> 6)
> +;     m3 = the 8 selected taps, duplicated into both qwords
> +;     m7 = scratch written by FILTER_H8_W8 / FILTER_H8_W4
> +;
> +;   Eight XMM registers are declared because the helpers write m7. xmm6 and
> +;   up are callee-saved on Win64, and cglobal can only preserve registers
> +;   that fall inside the declared count.
> +%macro IPFILTER_LUMA 2
> +cglobal interp_8tap_horiz_pp_%1x%2, 4, 6, 8
> +
> +    mov         r4d, r4m                    ; coeffIdx: only the first 4 args are auto-loaded
> +
> +%ifdef PIC
> +    lea         r5, [tab_LumaCoeff]
> +    movh        m3, [r5 + r4 * 8]           ; 8 coefficient bytes per coeffIdx
> +%else
> +    movh        m3, [tab_LumaCoeff + r4 * 8]
> +%endif
> +
> +    punpcklqdq  m3, m3                      ; same taps for both pixels of a shuffled pair
> +    mova        m2, [tab_c_512]
> +    mov         r4, %2                      ; rows left to filter
> +
> +.loop:
> +    xor         r5, r5                      ; restart at column 0 of the row
> +
> +    ; The row is covered left to right by unrolled 8-wide steps, then at
> +    ; most one 4-wide step, then single pixels for any remainder.
> +%rep %1 / 8
> +    FILTER_H8_W8 m0, m1, m2
> +    add         r5, 8
> +%endrep
> +
> +%rep (%1 % 8) / 4
> +    FILTER_H8_W4 m0, m1, m2
> +    add         r5, 4
> +%endrep
> +
> +%rep (%1 % 4)
> +    FILTER_H8_W1 m0, m1, m2
> +    add         r5, 1
> +%endrep
> +
> +    add         r0, r1                      ; next source row
> +    add         r2, r3                      ; next destination row
> +
> +    dec         r4d
> +    jnz         .loop
> +    RET
> +%endmacro
> +
> + IPFILTER_LUMA 4, 4 ; one interp_8tap_horiz_pp_<W>x<H> function is emitted per (width, height) pair listed below
> + IPFILTER_LUMA 8, 8
> + IPFILTER_LUMA 8, 4
> + IPFILTER_LUMA 4, 8
> + IPFILTER_LUMA 16, 16
> + IPFILTER_LUMA 16, 8
> + IPFILTER_LUMA 8, 16
> + IPFILTER_LUMA 16, 12
> + IPFILTER_LUMA 12, 16 ; width 12 expands to one 8-wide step plus one 4-wide step per row
> + IPFILTER_LUMA 16, 4
> + IPFILTER_LUMA 4, 16
> + IPFILTER_LUMA 32, 32
> + IPFILTER_LUMA 32, 16
> + IPFILTER_LUMA 16, 32
> + IPFILTER_LUMA 32, 24
> + IPFILTER_LUMA 24, 32
> + IPFILTER_LUMA 32, 8
> + IPFILTER_LUMA 8, 32
> + IPFILTER_LUMA 64, 64
> + IPFILTER_LUMA 64, 32
> + IPFILTER_LUMA 32, 64
> + IPFILTER_LUMA 64, 48
> + IPFILTER_LUMA 48, 64
> + IPFILTER_LUMA 64, 16
> + IPFILTER_LUMA 16, 64
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131018/0dcb9bff/attachment-0001.html>
More information about the x265-devel
mailing list