[x265] primitives for RExt
Satoshi Nakagawa
nakagawa424 at oki.com
Wed Aug 6 06:09:42 CEST 2014
>>- mov byte [rsp], %2/4
>>+ mov dword [rsp], %2/4
>Why dword? A byte is enough for the dynamic range.
A partial write needs a read-modify-write.
>>+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1,
srcStride0, srcStride1
>pinsrw takes 2 uops; using movd to load 4 bytes and dropping the unused ones is better.
Thanks.
This function is not used and will be removed.
From: x265-devel [mailto:x265-devel-bounces at videolan.org] On Behalf Of chen
Sent: Wednesday, August 06, 2014 3:29 AM
To: Development for x265
Subject: Re: [x265] primitives for RExt
At 2014-08-05 20:48:50,"Satoshi Nakagawa" <nakagawa424 at oki.com> wrote:
># HG changeset patch
># User Satoshi Nakagawa <nakagawa424 at oki.com>
># Date 1407242513 -32400
># Tue Aug 05 21:41:53 2014 +0900
># Node ID 770c40d768d55e68e76c485d5dc61d014257e789
># Parent 0d4723a0080cff763ff20ab9c516c6e082496a0b
>primitives for RExt
>
>@@ -1494,7 +1599,7 @@
>
;---------------------------------------------------------------------------
--------------------------------------
> %macro FILTER_VER_CHROMA_SS 4
> INIT_XMM sse2
>-cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-1
>+cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize
>
> add r1d, r1d
> add r3d, r3d
>@@ -1508,7 +1613,7 @@
> lea r6, [tab_ChromaCoeffV + r4]
> %endif
>
>- mov byte [rsp], %2/4
>+ mov dword [rsp], %2/4
Why dword? A byte is enough for the dynamic range.
>diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm Tue Aug 05 01:05:47 2014
-0500
>+++ b/source/common/x86/pixel-util8.asm Tue Aug 05 21:41:53 2014
+0900
>@@ -2878,6 +2878,61 @@
> RET
>
>
;---------------------------------------------------------------------------
--
>+; void pixel_sub_ps_2x%2(pixel *dest, intptr_t destride, pixel *src0,
int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
>+;-------------------------------------------------------------------------
----
>+%macro PIXEL_SUB_PS_W2_H2 2
>+%if HIGH_BIT_DEPTH
>+INIT_XMM sse2
>+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1,
srcStride0, srcStride1
>+ add r1, r1
>+ add r4, r4
>+ add r5, r5
>+ mov r6d, %2/2
>+.loop:
>+ movd m0, [r2]
>+ movd m1, [r3]
>+ movd m2, [r2 + r4]
>+ movd m3, [r3 + r5]
>+ dec r6d
>+ lea r2, [r2 + r4 * 2]
>+ lea r3, [r3 + r5 * 2]
>+ psubw m0, m1
>+ psubw m2, m3
>+ movd [r0], m0
>+ movd [r0 + r1], m2
>+ lea r0, [r0 + 2 * r1]
>+ jnz .loop
>+ RET
>+%else
>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1,
srcStride0, srcStride1
>+ add r1, r1
>+ mov r6d, %2/2
>+.loop:
>+ pinsrw m0, [r2], 0
>+ pinsrw m1, [r3], 0
>+ pinsrw m2, [r2 + r4], 0
>+ pinsrw m3, [r3 + r5], 0
pinsrw takes 2 uops; using movd to load 4 bytes and dropping the unused ones is better.
>+ dec r6d
>+ lea r2, [r2 + r4 * 2]
>+ lea r3, [r3 + r5 * 2]
>+ pmovzxbw m0, m0
>+ pmovzxbw m1, m1
>+ pmovzxbw m2, m2
>+ pmovzxbw m3, m3
>+ psubw m0, m1
>+ psubw m2, m3
>+ movd [r0], m0
>+ movd [r0 + r1], m2
>+ lea r0, [r0 + r1 * 2]
>+ jnz .loop
>+ RET
>+%endif
>+%endmacro
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140806/4be4bf51/attachment.html>
More information about the x265-devel
mailing list