[x265] [PATCH] asm: pixelsub_ps routine for all block sizes
chen
chenm003 at 163.com
Tue Nov 12 08:31:20 CET 2013
>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x4, 6, 6, 4, dest, deststride, src0, src1, srcstride0, srcstride1
>+
>+add r1, r1
>+
>+movd m0, [r2]
>+pinsrw m0, [r2 + r4], 2
>+movd m1, [r2 + 2 * r4]
>+
>+movd m2, [r3]
>+pinsrw m2, [r3 + r5], 2
>+movd m3, [r3 + 2 * r5]
>+
>+lea r2, [r2 + 2 * r4]
>+lea r3, [r3 + 2 * r5]
>+
>+pinsrw m1, [r2 + r4], 2
>+pinsrw m3, [r3 + r5], 2
>+
>+pmovzxbw m0, m0
>+pmovzxbw m1, m1
>+pmovzxbw m2, m2
>+pmovzxbw m3, m3
>+
>+psubw m0, m2
>+psubw m1, m3
>+
>+movd [r0], m0
>+pextrd [r0 + r1], m0, 2
>+movd [r0 + 2* r1], m1
>+
>+lea r0, [r0 + 2 * r1]
>+
>+pextrd [r0 + r1], m1, 2
>+
>+RET
>+
example:
;-----------------------------------------------------------------------------
; void pixel_sub_ps_2x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;
; Scalar 2x4 difference: dest[y][x] = src0[y][x] - src1[y][x], x = 0..1, y = 0..3.
; Source samples are read as unsigned bytes; results are stored as 16-bit words.
;
; Register roles (x86inc argument order):
;   r0 = dest            r1 = deststride (doubled below before it is added to r0)
;   r2 = src0            r3 = src1
;   r4 = srcstride0      r5 = srcstride1
;   t0, t1 = scratch for the two samples being subtracted
;
; x86-32 has only 7 GPRs available here, so r1 doubles as t1; the doubled
; dest stride is therefore spilled to [rsp] and re-read through tmp_r1.
;-----------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal pixel_sub_ps_2x4, 6, 8, 0
    %define tmp_r1 r1
    DECLARE_REG_TMP 6,7
%else ; !ARCH_X86_64
cglobal pixel_sub_ps_2x4, 6, 7, 0, 0-4
    %define tmp_r1 dword [rsp]
    DECLARE_REG_TMP 6,1
%endif ; ARCH_X86_64

    add     r1, r1                          ; deststride *= 2
%if ARCH_X86_64 == 0
    mov     tmp_r1, r1                      ; spill: r1 is about to be reused as t1
%endif

    ; row 0
    movzx   t0d, byte [r2]
    movzx   t1d, byte [r3]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + 1]
    movzx   t1d, byte [r3 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    add     r0, tmp_r1

    ; row 1
    movzx   t0d, byte [r2 + r4]
    movzx   t1d, byte [r3 + r5]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + r4 + 1]
    movzx   t1d, byte [r3 + r5 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    add     r0, tmp_r1

    ; row 2
    movzx   t0d, byte [r2 + r4 * 2]
    movzx   t1d, byte [r3 + r5 * 2]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + r4 * 2 + 1]
    movzx   t1d, byte [r3 + r5 * 2 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    add     r0, tmp_r1

    ; row 3
    ; r2/r3 are advanced to row 2 here, so row 3 sits one further stride on.
    ; (Reading [r2]/[r3] directly would repeat row 2.)
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    movzx   t0d, byte [r2 + r4]
    movzx   t1d, byte [r3 + r5]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + r4 + 1]
    movzx   t1d, byte [r3 + r5 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131112/b4e670c9/attachment-0001.html>
More information about the x265-devel
mailing list