[x265] [PATCH] asm: pixelsub_ps routine for all block sizes

chen chenm003 at 163.com
Tue Nov 12 08:31:20 CET 2013


>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x4, 6, 6, 4, dest, deststride, src0, src1, srcstride0, srcstride1    ; r0=dest r1=deststride r2=src0 r3=src1 r4=srcstride0 r5=srcstride1
>+
>+add         r1,    r1    ; r1 = 2 * deststride (presumably element stride -> byte stride for 16-bit output; confirm against caller)
>+
>+movd        m0,    [r2]    ; src0 row 0: loads 4 bytes, only the low 2 pixels are used
>+pinsrw      m0,    [r2 + r4],    2    ; src0 row 1 -> word lane 2 (bytes 4-5)
>+movd        m1,    [r2 + 2 * r4]    ; src0 row 2
>+
>+movd        m2,    [r3]    ; src1 row 0
>+pinsrw      m2,    [r3 + r5],    2    ; src1 row 1 -> word lane 2
>+movd        m3,    [r3 + 2 * r5]    ; src1 row 2
>+
>+lea         r2,    [r2 + 2 * r4]    ; advance src0 to row 2 so [r2 + r4] addresses row 3
>+lea         r3,    [r3 + 2 * r5]    ; advance src1 to row 2 so [r3 + r5] addresses row 3
>+
>+pinsrw      m1,    [r2 + r4],    2    ; src0 row 3 -> word lane 2
>+pinsrw      m3,    [r3 + r5],    2    ; src1 row 3 -> word lane 2
>+
>+pmovzxbw    m0,    m0    ; widen bytes to words: row pixels now sit in dword 0 and dword 2
>+pmovzxbw    m1,    m1
>+pmovzxbw    m2,    m2
>+pmovzxbw    m3,    m3
>+
>+psubw       m0,    m2    ; rows 0/1: src0 - src1 as 16-bit differences
>+psubw       m1,    m3    ; rows 2/3: src0 - src1 as 16-bit differences
>+
>+movd      [r0],            m0    ; dest row 0 (two 16-bit results)
>+pextrd    [r0 + r1],       m0,    2    ; dest row 1 taken from dword 2
>+movd      [r0 + 2* r1],    m1    ; dest row 2
>+
>+lea     r0,              [r0 + 2 * r1]    ; advance dest to row 2 so [r0 + r1] addresses row 3
>+
>+pextrd    [r0 + r1],       m1,    2    ; dest row 3 taken from dword 2
>+
>+RET
>+

 
Example (a scalar version that uses only general-purpose registers):
;-----------------------------------------------------------------------------
; void pixel_sub_ps_2x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;
; Scalar 2x4 residual: dest[y][x] = src0[y][x] - src1[y][x] for x in 0..1,
; y in 0..3, with each source sample read as an unsigned byte and each
; result stored as a 16-bit word.
;
; In:    r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;        r4 = srcstride0, r5 = srcstride1
; Temps: t0, t1 (on x86-32, t1 aliases r1, so the doubled dest stride is
;        kept in a stack slot instead)
;-----------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal pixel_sub_ps_2x4, 6, 8, 0
  %define tmp_r1  r1
  DECLARE_REG_TMP 6,7
%else ; !ARCH_X86_64
cglobal pixel_sub_ps_2x4, 6, 7, 0, 0-4
  %define tmp_r1  dword [rsp]
  DECLARE_REG_TMP 6,1
%endif ; ARCH_X86_64

    add     r1, r1                      ; r1 = 2 * deststride (bytes per row of 16-bit output)
%if ARCH_X86_64 == 0
    mov     tmp_r1, r1                  ; spill: r1 is reused as t1 below
%endif
    ; row 0
    movzx   t0d, byte [r2]
    movzx   t1d, byte [r3]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + 1]
    movzx   t1d, byte [r3 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    add     r0, tmp_r1
    ; row 1
    movzx   t0d, byte [r2 + r4]
    movzx   t1d, byte [r3 + r5]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + r4 + 1]
    movzx   t1d, byte [r3 + r5 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    add     r0, tmp_r1
    ; row 2
    movzx   t0d, byte [r2 + r4 * 2]
    movzx   t1d, byte [r3 + r5 * 2]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + r4 * 2 + 1]
    movzx   t1d, byte [r3 + r5 * 2 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    add     r0, tmp_r1
    ; row 3
    ; The lea pair only advances the sources to row 2 (a stride*3 scale is
    ; not encodable), so row 3 must be addressed as [base + stride].
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    movzx   t0d, byte [r2 + r4]
    movzx   t1d, byte [r3 + r5]
    sub     t0d, t1d
    mov     [r0], t0w
    movzx   t0d, byte [r2 + r4 + 1]
    movzx   t1d, byte [r3 + r5 + 1]
    sub     t0d, t1d
    mov     [r0 + 2], t0w
    RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131112/b4e670c9/attachment-0001.html>


More information about the x265-devel mailing list