[x265] [PATCH] asm: removed unused code from pixel-a.asm

Steve Borho steve at borho.org
Mon Dec 2 08:16:55 CET 2013


On Mon, Dec 2, 2013 at 12:49 AM, <murugan at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1385966974 -19800
> #      Mon Dec 02 12:19:34 2013 +0530
> # Node ID 1695371f63a6cdef5ece9d17f94b286fc17cc29e
> # Parent  ace5b9ee099d1539f020e68971a27577148a4a29
> asm: removed unused code from pixel-a.asm
>

queued.

For clarity, we should also prune the unused function declarations from
pixel.h - at least those for functions that have been removed entirely or
are never used by x265.
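
To make that concrete, below is a rough sketch of the declarations this
patch leaves without any asm implementation behind them. The prototypes are
copied from the comment headers of the removed code; the exact names,
cpu-suffix macros, and whether each one still appears in x265's pixel.h are
assumptions, so treat it as illustrative only:

    /* pixel.h (sketch): declarations whose only asm implementations are
     * deleted by this patch; prune any that nothing in x265 still calls.
     * Prototypes copied from the comment headers of the removed asm;
     * the actual names/suffixes used in pixel.h are assumptions. */
    #include <stdint.h>

    typedef uint8_t pixel;  /* uint16_t when HIGH_BIT_DEPTH; illustrative only */

    void pixel_ssd_nv12_core( pixel *pixuv1, intptr_t stride1,
                              pixel *pixuv2, intptr_t stride2,
                              int width, int height,
                              uint64_t *ssd_u, uint64_t *ssd_v );
    int  pixel_var2_8x8( pixel *pix1, intptr_t stride1,
                         pixel *pix2, intptr_t stride2, int *ssd );
    int  pixel_asd8( pixel *pix1, intptr_t stride1,
                     pixel *pix2, intptr_t stride2, int height );
    int  pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                     uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
    int  pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width );

Anything on that list with no remaining C-side caller could go in the same
cleanup.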


>
> diff -r ace5b9ee099d -r 1695371f63a6 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm     Mon Dec 02 11:25:00 2013 +0530
> +++ b/source/common/x86/pixel-a.asm     Mon Dec 02 12:19:34 2013 +0530
> @@ -2242,182 +2242,6 @@
>      movd     eax,    m7
>      RET
>
> -;-----------------------------------------------------------------------------
> -; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
> -;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> -;
> -; The maximum width this function can handle without risk of overflow is given
> -; in the following equation: (mmsize in bits)
> -;
> -;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
> -;
> -; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
> -; distortion levels it will take much more than that though.
> -;-----------------------------------------------------------------------------
> -%if HIGH_BIT_DEPTH
> -%macro SSD_NV12 0
> -cglobal pixel_ssd_nv12_core, 6,7,7
> -    shl        r4d, 2
> -    FIX_STRIDES r1, r3
> -    add         r0, r4
> -    add         r2, r4
> -    xor         r6, r6
> -    pxor        m4, m4
> -    pxor        m5, m5
> -    pxor        m6, m6
> -.loopy:
> -    mov         r6, r4
> -    neg         r6
> -    pxor        m2, m2
> -    pxor        m3, m3
> -.loopx:
> -    mova        m0, [r0+r6]
> -    mova        m1, [r0+r6+mmsize]
> -    psubw       m0, [r2+r6]
> -    psubw       m1, [r2+r6+mmsize]
> -    PSHUFLW     m0, m0, q3120
> -    PSHUFLW     m1, m1, q3120
> -%if mmsize >= 16
> -    pshufhw     m0, m0, q3120
> -    pshufhw     m1, m1, q3120
> -%endif
> -    pmaddwd     m0, m0
> -    pmaddwd     m1, m1
> -    paddd       m2, m0
> -    paddd       m3, m1
> -    add         r6, 2*mmsize
> -    jl .loopx
> -%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
> -    jz .no_overread
> -    psubd       m3, m1
> -.no_overread:
> -%endif
> -%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
> -                 ; equation above, putting the width limit at 8208
> -    punpckhdq   m0, m2, m6
> -    punpckhdq   m1, m3, m6
> -    punpckldq   m2, m6
> -    punpckldq   m3, m6
> -    paddq       m3, m2
> -    paddq       m1, m0
> -    paddq       m4, m3
> -    paddq       m4, m1
> -%else ; unfortunately paddq is sse2
> -      ; emulate 48 bit precision for mmx2 instead
> -    mova        m0, m2
> -    mova        m1, m3
> -    punpcklwd   m2, m6
> -    punpcklwd   m3, m6
> -    punpckhwd   m0, m6
> -    punpckhwd   m1, m6
> -    paddd       m3, m2
> -    paddd       m1, m0
> -    paddd       m4, m3
> -    paddd       m5, m1
> -%endif
> -    add         r0, r1
> -    add         r2, r3
> -    dec        r5d
> -    jg .loopy
> -    mov         r3, r6m
> -    mov         r4, r7m
> -%if mmsize == 32
> -    vextracti128 xm0, m4, 1
> -    paddq      xm4, xm0
> -%endif
> -%if mmsize >= 16
> -    movq      [r3], xm4
> -    movhps    [r4], xm4
> -%else ; fixup for mmx2
> -    SBUTTERFLY dq, 4, 5, 0
> -    mova        m0, m4
> -    psrld       m4, 16
> -    paddd       m5, m4
> -    pslld       m0, 16
> -    SBUTTERFLY dq, 0, 5, 4
> -    psrlq       m0, 16
> -    psrlq       m5, 16
> -    movq      [r3], m0
> -    movq      [r4], m5
> -%endif
> -    RET
> -%endmacro ; SSD_NV12
> -%endif ; HIGH_BIT_DEPTH
> -
> -%if HIGH_BIT_DEPTH == 0
> -;-----------------------------------------------------------------------------
> -; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
> -;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> -;
> -; This implementation can potentially overflow on image widths >= 11008 (or
> -; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
> -; 20). At sane distortion levels it will take much more than that though.
> -;-----------------------------------------------------------------------------
> -%macro SSD_NV12 0
> -cglobal pixel_ssd_nv12_core, 6,7
> -    add    r4d, r4d
> -    add     r0, r4
> -    add     r2, r4
> -    pxor    m3, m3
> -    pxor    m4, m4
> -    mova    m5, [pw_00ff]
> -.loopy:
> -    mov     r6, r4
> -    neg     r6
> -.loopx:
> -%if mmsize == 32 ; only 16-byte alignment is guaranteed
> -    movu    m2, [r0+r6]
> -    movu    m1, [r2+r6]
> -%else
> -    mova    m2, [r0+r6]
> -    mova    m1, [r2+r6]
> -%endif
> -    psubusb m0, m2, m1
> -    psubusb m1, m2
> -    por     m0, m1
> -    psrlw   m2, m0, 8
> -    pand    m0, m5
> -    pmaddwd m2, m2
> -    pmaddwd m0, m0
> -    paddd   m3, m0
> -    paddd   m4, m2
> -    add     r6, mmsize
> -    jl .loopx
> -%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
> -    jz .no_overread
> -    pcmpeqb xm1, xm1
> -    pandn   m0, m1, m0 ; zero the lower half
> -    pandn   m2, m1, m2
> -    psubd   m3, m0
> -    psubd   m4, m2
> -.no_overread:
> -%endif
> -    add     r0, r1
> -    add     r2, r3
> -    dec    r5d
> -    jg .loopy
> -    mov     r3, r6m
> -    mov     r4, r7m
> -    HADDD   m3, m0
> -    HADDD   m4, m0
> -    pxor   xm0, xm0
> -    punpckldq xm3, xm0
> -    punpckldq xm4, xm0
> -    movq  [r3], xm3
> -    movq  [r4], xm4
> -    RET
> -%endmacro ; SSD_NV12
> -%endif ; !HIGH_BIT_DEPTH
> -
> -INIT_MMX mmx2
> -SSD_NV12
> -INIT_XMM sse2
> -SSD_NV12
> -INIT_XMM avx
> -SSD_NV12
> -INIT_YMM avx2
> -SSD_NV12
> -
>
>  ;=============================================================================
>  ; variance
>  ;=============================================================================
> @@ -2841,183 +2665,6 @@
>      RET
>  %endmacro
>
> -;-----------------------------------------------------------------------------
> -; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
> -;-----------------------------------------------------------------------------
> -%macro VAR2_8x8_MMX 2
> -cglobal pixel_var2_8x%1, 5,6
> -    FIX_STRIDES r1, r3
> -    VAR_START 0
> -    mov      r5d, %1
> -.loop:
> -%if HIGH_BIT_DEPTH
> -    mova      m0, [r0]
> -    mova      m1, [r0+mmsize]
> -    psubw     m0, [r2]
> -    psubw     m1, [r2+mmsize]
> -%else ; !HIGH_BIT_DEPTH
> -    movq      m0, [r0]
> -    movq      m1, m0
> -    movq      m2, [r2]
> -    movq      m3, m2
> -    punpcklbw m0, m7
> -    punpckhbw m1, m7
> -    punpcklbw m2, m7
> -    punpckhbw m3, m7
> -    psubw     m0, m2
> -    psubw     m1, m3
> -%endif ; HIGH_BIT_DEPTH
> -    paddw     m5, m0
> -    paddw     m5, m1
> -    pmaddwd   m0, m0
> -    pmaddwd   m1, m1
> -    paddd     m6, m0
> -    paddd     m6, m1
> -    add       r0, r1
> -    add       r2, r3
> -    dec       r5d
> -    jg .loop
> -    VAR2_END %2, m5, m6
> -%endmacro
> -
> -%if ARCH_X86_64 == 0
> -INIT_MMX mmx2
> -VAR2_8x8_MMX  8, 6
> -VAR2_8x8_MMX 16, 7
> -%endif
> -
> -%macro VAR2_8x8_SSE2 2
> -cglobal pixel_var2_8x%1, 5,6,8
> -    VAR_START 1
> -    mov      r5d, %1/2
> -.loop:
> -%if HIGH_BIT_DEPTH
> -    mova      m0, [r0]
> -    mova      m1, [r0+r1*2]
> -    mova      m2, [r2]
> -    mova      m3, [r2+r3*2]
> -%else ; !HIGH_BIT_DEPTH
> -    movq      m1, [r0]
> -    movhps    m1, [r0+r1]
> -    movq      m3, [r2]
> -    movhps    m3, [r2+r3]
> -    DEINTB    0, 1, 2, 3, 7
> -%endif ; HIGH_BIT_DEPTH
> -    psubw     m0, m2
> -    psubw     m1, m3
> -    paddw     m5, m0
> -    paddw     m5, m1
> -    pmaddwd   m0, m0
> -    pmaddwd   m1, m1
> -    paddd     m6, m0
> -    paddd     m6, m1
> -    lea       r0, [r0+r1*2*SIZEOF_PIXEL]
> -    lea       r2, [r2+r3*2*SIZEOF_PIXEL]
> -    dec      r5d
> -    jg .loop
> -    VAR2_END %2, m5, m6
> -%endmacro
> -
> -INIT_XMM sse2
> -VAR2_8x8_SSE2  8, 6
> -VAR2_8x8_SSE2 16, 7
> -
> -%if HIGH_BIT_DEPTH == 0
> -%macro VAR2_8x8_SSSE3 2
> -cglobal pixel_var2_8x%1, 5,6,8
> -    pxor      m5, m5    ; sum
> -    pxor      m6, m6    ; sum squared
> -    mova      m7, [hsub_mul]
> -    mov      r5d, %1/4
> -.loop:
> -    movq      m0, [r0]
> -    movq      m2, [r2]
> -    movq      m1, [r0+r1]
> -    movq      m3, [r2+r3]
> -    lea       r0, [r0+r1*2]
> -    lea       r2, [r2+r3*2]
> -    punpcklbw m0, m2
> -    punpcklbw m1, m3
> -    movq      m2, [r0]
> -    movq      m3, [r2]
> -    punpcklbw m2, m3
> -    movq      m3, [r0+r1]
> -    movq      m4, [r2+r3]
> -    punpcklbw m3, m4
> -    pmaddubsw m0, m7
> -    pmaddubsw m1, m7
> -    pmaddubsw m2, m7
> -    pmaddubsw m3, m7
> -    paddw     m5, m0
> -    paddw     m5, m1
> -    paddw     m5, m2
> -    paddw     m5, m3
> -    pmaddwd   m0, m0
> -    pmaddwd   m1, m1
> -    pmaddwd   m2, m2
> -    pmaddwd   m3, m3
> -    paddd     m6, m0
> -    paddd     m6, m1
> -    paddd     m6, m2
> -    paddd     m6, m3
> -    lea       r0, [r0+r1*2]
> -    lea       r2, [r2+r3*2]
> -    dec      r5d
> -    jg .loop
> -    VAR2_END %2, m5, m6
> -%endmacro
> -
> -INIT_XMM ssse3
> -VAR2_8x8_SSSE3  8, 6
> -VAR2_8x8_SSSE3 16, 7
> -INIT_XMM xop
> -VAR2_8x8_SSSE3  8, 6
> -VAR2_8x8_SSSE3 16, 7
> -
> -%macro VAR2_8x8_AVX2 2
> -cglobal pixel_var2_8x%1, 5,6,6
> -    pxor      m3, m3    ; sum
> -    pxor      m4, m4    ; sum squared
> -    mova      m5, [hsub_mul]
> -    mov      r5d, %1/4
> -.loop:
> -    movq     xm0, [r0]
> -    movq     xm1, [r2]
> -    vinserti128 m0, m0, [r0+r1], 1
> -    vinserti128 m1, m1, [r2+r3], 1
> -    lea       r0, [r0+r1*2]
> -    lea       r2, [r2+r3*2]
> -    punpcklbw m0, m1
> -    movq     xm1, [r0]
> -    movq     xm2, [r2]
> -    vinserti128 m1, m1, [r0+r1], 1
> -    vinserti128 m2, m2, [r2+r3], 1
> -    lea       r0, [r0+r1*2]
> -    lea       r2, [r2+r3*2]
> -    punpcklbw m1, m2
> -    pmaddubsw m0, m5
> -    pmaddubsw m1, m5
> -    paddw     m3, m0
> -    paddw     m3, m1
> -    pmaddwd   m0, m0
> -    pmaddwd   m1, m1
> -    paddd     m4, m0
> -    paddd     m4, m1
> -    dec      r5d
> -    jg .loop
> -    vextracti128 xm0, m3, 1
> -    vextracti128 xm1, m4, 1
> -    paddw    xm3, xm0
> -    paddd    xm4, xm1
> -    VAR2_END %2, xm3, xm4
> -%endmacro
> -
> -INIT_YMM avx2
> -VAR2_8x8_AVX2  8, 6
> -VAR2_8x8_AVX2 16, 7
> -
> -%endif ; !HIGH_BIT_DEPTH
> -
>
>  ;=============================================================================
>  ; SATD
>  ;=============================================================================
> @@ -9009,446 +8656,6 @@
>  INIT_XMM avx
>  SSIM
>
> -;-----------------------------------------------------------------------------
> -; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
> -;-----------------------------------------------------------------------------
> -%macro ASD8 0
> -cglobal pixel_asd8, 5,5
> -    pxor     m0, m0
> -    pxor     m1, m1
> -.loop:
> -%if HIGH_BIT_DEPTH
> -    paddw    m0, [r0]
> -    paddw    m1, [r2]
> -    paddw    m0, [r0+2*r1]
> -    paddw    m1, [r2+2*r3]
> -    lea      r0, [r0+4*r1]
> -    paddw    m0, [r0]
> -    paddw    m1, [r2+4*r3]
> -    lea      r2, [r2+4*r3]
> -    paddw    m0, [r0+2*r1]
> -    paddw    m1, [r2+2*r3]
> -    lea      r0, [r0+4*r1]
> -    lea      r2, [r2+4*r3]
> -%else
> -    movq     m2, [r0]
> -    movq     m3, [r2]
> -    movhps   m2, [r0+r1]
> -    movhps   m3, [r2+r3]
> -    lea      r0, [r0+2*r1]
> -    psadbw   m2, m1
> -    psadbw   m3, m1
> -    movq     m4, [r0]
> -    movq     m5, [r2+2*r3]
> -    lea      r2, [r2+2*r3]
> -    movhps   m4, [r0+r1]
> -    movhps   m5, [r2+r3]
> -    lea      r0, [r0+2*r1]
> -    paddw    m0, m2
> -    psubw    m0, m3
> -    psadbw   m4, m1
> -    psadbw   m5, m1
> -    lea      r2, [r2+2*r3]
> -    paddw    m0, m4
> -    psubw    m0, m5
> -%endif
> -    sub     r4d, 4
> -    jg .loop
> -%if HIGH_BIT_DEPTH
> -    psubw    m0, m1
> -    HADDW    m0, m1
> -    ABSD     m1, m0
> -%else
> -    movhlps  m1, m0
> -    paddw    m0, m1
> -    ABSW     m1, m0
> -%endif
> -    movd    eax, m1
> -    RET
> -%endmacro
> -
> -INIT_XMM sse2
> -ASD8
> -INIT_XMM ssse3
> -ASD8
> -%if HIGH_BIT_DEPTH
> -INIT_XMM xop
> -ASD8
> -%endif
> -
> -;=============================================================================
> -; Successive Elimination ADS
> -;=============================================================================
> -
> -%macro ADS_START 0
> -%if UNIX64
> -    movsxd  r5, r5d
> -%else
> -    mov    r5d, r5m
> -%endif
> -    mov    r0d, r5d
> -    lea     r6, [r4+r5+(mmsize-1)]
> -    and     r6, ~(mmsize-1)
> -    shl     r2d,  1
> -%endmacro
> -
> -%macro ADS_END 1 ; unroll_size
> -    add     r1, 8*%1
> -    add     r3, 8*%1
> -    add     r6, 4*%1
> -    sub    r0d, 4*%1
> -    jg .loop
> -    WIN64_RESTORE_XMM rsp
> -%if mmsize==32
> -    vzeroupper
> -%endif
> -    lea     r6, [r4+r5+(mmsize-1)]
> -    and     r6, ~(mmsize-1)
> -%if cpuflag(ssse3)
> -    jmp ads_mvs_ssse3
> -%else
> -    jmp ads_mvs_mmx
> -%endif
> -%endmacro
> -
> -;-----------------------------------------------------------------------------
> -; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
> -;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
> -;-----------------------------------------------------------------------------
> -INIT_MMX mmx2
> -cglobal pixel_ads4, 5,7
> -    mova    m6, [r0]
> -    mova    m4, [r0+8]
> -    pshufw  m7, m6, 0
> -    pshufw  m6, m6, q2222
> -    pshufw  m5, m4, 0
> -    pshufw  m4, m4, q2222
> -    ADS_START
> -.loop:
> -    movu      m0, [r1]
> -    movu      m1, [r1+16]
> -    psubw     m0, m7
> -    psubw     m1, m6
> -    ABSW      m0, m0, m2
> -    ABSW      m1, m1, m3
> -    movu      m2, [r1+r2]
> -    movu      m3, [r1+r2+16]
> -    psubw     m2, m5
> -    psubw     m3, m4
> -    paddw     m0, m1
> -    ABSW      m2, m2, m1
> -    ABSW      m3, m3, m1
> -    paddw     m0, m2
> -    paddw     m0, m3
> -    pshufw    m1, r6m, 0
> -    paddusw   m0, [r3]
> -    psubusw   m1, m0
> -    packsswb  m1, m1
> -    movd    [r6], m1
> -    ADS_END 1
> -
> -cglobal pixel_ads2, 5,7
> -    mova      m6, [r0]
> -    pshufw    m5, r6m, 0
> -    pshufw    m7, m6, 0
> -    pshufw    m6, m6, q2222
> -    ADS_START
> -.loop:
> -    movu      m0, [r1]
> -    movu      m1, [r1+r2]
> -    psubw     m0, m7
> -    psubw     m1, m6
> -    ABSW      m0, m0, m2
> -    ABSW      m1, m1, m3
> -    paddw     m0, m1
> -    paddusw   m0, [r3]
> -    mova      m4, m5
> -    psubusw   m4, m0
> -    packsswb  m4, m4
> -    movd    [r6], m4
> -    ADS_END 1
> -
> -cglobal pixel_ads1, 5,7
> -    pshufw    m7, [r0], 0
> -    pshufw    m6, r6m, 0
> -    ADS_START
> -.loop:
> -    movu      m0, [r1]
> -    movu      m1, [r1+8]
> -    psubw     m0, m7
> -    psubw     m1, m7
> -    ABSW      m0, m0, m2
> -    ABSW      m1, m1, m3
> -    paddusw   m0, [r3]
> -    paddusw   m1, [r3+8]
> -    mova      m4, m6
> -    mova      m5, m6
> -    psubusw   m4, m0
> -    psubusw   m5, m1
> -    packsswb  m4, m5
> -    mova    [r6], m4
> -    ADS_END 2
> -
> -%macro ADS_XMM 0
> -%if mmsize==32
> -cglobal pixel_ads4, 5,7,8
> -    vpbroadcastw m7, [r0+ 0]
> -    vpbroadcastw m6, [r0+ 4]
> -    vpbroadcastw m5, [r0+ 8]
> -    vpbroadcastw m4, [r0+12]
> -%else
> -cglobal pixel_ads4, 5,7,12
> -    mova      m4, [r0]
> -    pshuflw   m7, m4, q0000
> -    pshuflw   m6, m4, q2222
> -    pshufhw   m5, m4, q0000
> -    pshufhw   m4, m4, q2222
> -    punpcklqdq m7, m7
> -    punpcklqdq m6, m6
> -    punpckhqdq m5, m5
> -    punpckhqdq m4, m4
> -%endif
> -%if ARCH_X86_64 && mmsize == 16
> -    movd      m8, r6m
> -    SPLATW    m8, m8
> -    ADS_START
> -    movu     m10, [r1]
> -    movu     m11, [r1+r2]
> -.loop:
> -    psubw     m0, m10, m7
> -    movu     m10, [r1+16]
> -    psubw     m1, m10, m6
> -    ABSW      m0, m0, m2
> -    ABSW      m1, m1, m3
> -    psubw     m2, m11, m5
> -    movu     m11, [r1+r2+16]
> -    paddw     m0, m1
> -    psubw     m3, m11, m4
> -    movu      m9, [r3]
> -    ABSW      m2, m2, m1
> -    ABSW      m3, m3, m1
> -    paddw     m0, m2
> -    paddw     m0, m3
> -    paddusw   m0, m9
> -    psubusw   m1, m8, m0
> -%else
> -    ADS_START
> -.loop:
> -    movu      m0, [r1]
> -    movu      m1, [r1+16]
> -    psubw     m0, m7
> -    psubw     m1, m6
> -    ABSW      m0, m0, m2
> -    ABSW      m1, m1, m3
> -    movu      m2, [r1+r2]
> -    movu      m3, [r1+r2+16]
> -    psubw     m2, m5
> -    psubw     m3, m4
> -    paddw     m0, m1
> -    ABSW      m2, m2, m1
> -    ABSW      m3, m3, m1
> -    paddw     m0, m2
> -    paddw     m0, m3
> -    movu      m2, [r3]
> -%if mmsize==32
> -    vpbroadcastw m1, r6m
> -%else
> -    movd      m1, r6m
> -    pshuflw   m1, m1, 0
> -    punpcklqdq m1, m1
> -%endif
> -    paddusw   m0, m2
> -    psubusw   m1, m0
> -%endif ; ARCH
> -    packsswb  m1, m1
> -%if mmsize==32
> -    vpermq    m1, m1, q3120
> -    mova    [r6], xm1
> -%else
> -    movh    [r6], m1
> -%endif
> -    ADS_END mmsize/8
> -
> -cglobal pixel_ads2, 5,7,8
> -%if mmsize==32
> -    vpbroadcastw m7, [r0+0]
> -    vpbroadcastw m6, [r0+4]
> -    vpbroadcastw m5, r6m
> -%else
> -    movq      m6, [r0]
> -    movd      m5, r6m
> -    pshuflw   m7, m6, 0
> -    pshuflw   m6, m6, q2222
> -    pshuflw   m5, m5, 0
> -    punpcklqdq m7, m7
> -    punpcklqdq m6, m6
> -    punpcklqdq m5, m5
> -%endif
> -    ADS_START
> -.loop:
> -    movu      m0, [r1]
> -    movu      m1, [r1+r2]
> -    psubw     m0, m7
> -    psubw     m1, m6
> -    movu      m4, [r3]
> -    ABSW      m0, m0, m2
> -    ABSW      m1, m1, m3
> -    paddw     m0, m1
> -    paddusw   m0, m4
> -    psubusw   m1, m5, m0
> -    packsswb  m1, m1
> -%if mmsize==32
> -    vpermq    m1, m1, q3120
> -    mova    [r6], xm1
> -%else
> -    movh    [r6], m1
> -%endif
> -    ADS_END mmsize/8
> -
> -cglobal pixel_ads1, 5,7,8
> -%if mmsize==32
> -    vpbroadcastw m7, [r0]
> -    vpbroadcastw m6, r6m
> -%else
> -    movd      m7, [r0]
> -    movd      m6, r6m
> -    pshuflw   m7, m7, 0
> -    pshuflw   m6, m6, 0
> -    punpcklqdq m7, m7
> -    punpcklqdq m6, m6
> -%endif
> -    ADS_START
> -.loop:
> -    movu      m0, [r1]
> -    movu      m1, [r1+mmsize]
> -    psubw     m0, m7
> -    psubw     m1, m7
> -    movu      m2, [r3]
> -    movu      m3, [r3+mmsize]
> -    ABSW      m0, m0, m4
> -    ABSW      m1, m1, m5
> -    paddusw   m0, m2
> -    paddusw   m1, m3
> -    psubusw   m4, m6, m0
> -    psubusw   m5, m6, m1
> -    packsswb  m4, m5
> -%if mmsize==32
> -    vpermq    m4, m4, q3120
> -%endif
> -    mova    [r6], m4
> -    ADS_END mmsize/4
> -%endmacro
> -
> -INIT_XMM sse2
> -ADS_XMM
> -INIT_XMM ssse3
> -ADS_XMM
> -INIT_XMM avx
> -ADS_XMM
> -INIT_YMM avx2
> -ADS_XMM
> -
> -; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
> -; {
> -;     int nmv=0, i, j;
> -;     *(uint32_t*)(masks+width) = 0;
> -;     for( i=0; i<width; i+=8 )
> -;     {
> -;         uint64_t mask = *(uint64_t*)(masks+i);
> -;         if( !mask ) continue;
> -;         for( j=0; j<8; j++ )
> -;             if( mask & (255<<j*8) )
> -;                 mvs[nmv++] = i+j;
> -;     }
> -;     return nmv;
> -; }
> -
> -%macro TEST 1
> -    mov     [r4+r0*2], r1w
> -    test    r2d, 0xff<<(%1*8)
> -    setne   r3b
> -    add     r0d, r3d
> -    inc     r1d
> -%endmacro
> -
> -INIT_MMX mmx
> -cglobal pixel_ads_mvs, 0,7,0
> -ads_mvs_mmx:
> -    ; mvs = r4
> -    ; masks = r6
> -    ; width = r5
> -    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
> -    xor     r0d, r0d
> -    xor     r1d, r1d
> -    mov     [r6+r5], r0d
> -    jmp .loopi
> -ALIGN 16
> -.loopi0:
> -    add     r1d, 8
> -    cmp     r1d, r5d
> -    jge .end
> -.loopi:
> -    mov     r2,  [r6+r1]
> -%if ARCH_X86_64
> -    test    r2,  r2
> -%else
> -    mov     r3,  r2
> -    add    r3d, [r6+r1+4]
> -%endif
> -    jz .loopi0
> -    xor     r3d, r3d
> -    TEST 0
> -    TEST 1
> -    TEST 2
> -    TEST 3
> -%if ARCH_X86_64
> -    shr     r2,  32
> -%else
> -    mov     r2d, [r6+r1]
> -%endif
> -    TEST 0
> -    TEST 1
> -    TEST 2
> -    TEST 3
> -    cmp     r1d, r5d
> -    jl .loopi
> -.end:
> -    movifnidn eax, r0d
> -    RET
> -
> -INIT_XMM ssse3
> -cglobal pixel_ads_mvs, 0,7,0
> -ads_mvs_ssse3:
> -    mova      m3, [pw_8]
> -    mova      m4, [pw_76543210]
> -    pxor      m5, m5
> -    add       r5, r6
> -    xor      r0d, r0d ; nmv
> -    mov     [r5], r0d
> -%ifdef PIC
> -    lea       r1, [$$]
> -    %define GLOBAL +r1-$$
> -%else
> -    %define GLOBAL
> -%endif
> -.loop:
> -    movh      m0, [r6]
> -    pcmpeqb   m0, m5
> -    pmovmskb r2d, m0
> -    xor      r2d, 0xffff                         ; skipping if r2d is zero is slower (branch mispredictions)
> -    movzx    r3d, byte [r2+popcnt_table GLOBAL]  ; popcnt
> -    add      r2d, r2d
> -    ; shuffle counters based on mv mask
> -    pshufb    m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
> -    movu [r4+r0*2], m2
> -    add      r0d, r3d
> -    paddw     m4, m3                             ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
> -    add       r6, 8
> -    cmp       r6, r5
> -    jl .loop
> -    movifnidn eax, r0d
> -    RET
> -
>  ;-----------------------------------------------------------------
>  ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
>  ;-----------------------------------------------------------------



-- 
Steve Borho