[x265] [PATCH] asm: removed unused code from pixel-a.asm

murugan at multicorewareinc.com
Mon Dec 2 07:49:53 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385966974 -19800
#      Mon Dec 02 12:19:34 2013 +0530
# Node ID 1695371f63a6cdef5ece9d17f94b286fc17cc29e
# Parent  ace5b9ee099d1539f020e68971a27577148a4a29
asm: removed unused code from pixel-a.asm

diff -r ace5b9ee099d -r 1695371f63a6 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Dec 02 11:25:00 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Dec 02 12:19:34 2013 +0530
@@ -2242,182 +2242,6 @@
     movd     eax,    m7
     RET
 
-;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
-;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-;
-; The maximum width this function can handle without risk of overflow is given
-; in the following equation: (mmsize in bits)
-;
-;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
-;
-; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
-; distortion levels it will take much more than that though.
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7,7
-    shl        r4d, 2
-    FIX_STRIDES r1, r3
-    add         r0, r4
-    add         r2, r4
-    xor         r6, r6
-    pxor        m4, m4
-    pxor        m5, m5
-    pxor        m6, m6
-.loopy:
-    mov         r6, r4
-    neg         r6
-    pxor        m2, m2
-    pxor        m3, m3
-.loopx:
-    mova        m0, [r0+r6]
-    mova        m1, [r0+r6+mmsize]
-    psubw       m0, [r2+r6]
-    psubw       m1, [r2+r6+mmsize]
-    PSHUFLW     m0, m0, q3120
-    PSHUFLW     m1, m1, q3120
-%if mmsize >= 16
-    pshufhw     m0, m0, q3120
-    pshufhw     m1, m1, q3120
-%endif
-    pmaddwd     m0, m0
-    pmaddwd     m1, m1
-    paddd       m2, m0
-    paddd       m3, m1
-    add         r6, 2*mmsize
-    jl .loopx
-%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
-    jz .no_overread
-    psubd       m3, m1
-.no_overread:
-%endif
-%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
-                 ; equation above, putting the width limit at 8208
-    punpckhdq   m0, m2, m6
-    punpckhdq   m1, m3, m6
-    punpckldq   m2, m6
-    punpckldq   m3, m6
-    paddq       m3, m2
-    paddq       m1, m0
-    paddq       m4, m3
-    paddq       m4, m1
-%else ; unfortunately paddq is sse2
-      ; emulate 48 bit precision for mmx2 instead
-    mova        m0, m2
-    mova        m1, m3
-    punpcklwd   m2, m6
-    punpcklwd   m3, m6
-    punpckhwd   m0, m6
-    punpckhwd   m1, m6
-    paddd       m3, m2
-    paddd       m1, m0
-    paddd       m4, m3
-    paddd       m5, m1
-%endif
-    add         r0, r1
-    add         r2, r3
-    dec        r5d
-    jg .loopy
-    mov         r3, r6m
-    mov         r4, r7m
-%if mmsize == 32
-    vextracti128 xm0, m4, 1
-    paddq      xm4, xm0
-%endif
-%if mmsize >= 16
-    movq      [r3], xm4
-    movhps    [r4], xm4
-%else ; fixup for mmx2
-    SBUTTERFLY dq, 4, 5, 0
-    mova        m0, m4
-    psrld       m4, 16
-    paddd       m5, m4
-    pslld       m0, 16
-    SBUTTERFLY dq, 0, 5, 4
-    psrlq       m0, 16
-    psrlq       m5, 16
-    movq      [r3], m0
-    movq      [r4], m5
-%endif
-    RET
-%endmacro ; SSD_NV12
-%endif ; HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH == 0
-;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
-;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
-;
-; This implementation can potentially overflow on image widths >= 11008 (or
-; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
-; 20). At sane distortion levels it will take much more than that though.
-;-----------------------------------------------------------------------------
-%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7
-    add    r4d, r4d
-    add     r0, r4
-    add     r2, r4
-    pxor    m3, m3
-    pxor    m4, m4
-    mova    m5, [pw_00ff]
-.loopy:
-    mov     r6, r4
-    neg     r6
-.loopx:
-%if mmsize == 32 ; only 16-byte alignment is guaranteed
-    movu    m2, [r0+r6]
-    movu    m1, [r2+r6]
-%else
-    mova    m2, [r0+r6]
-    mova    m1, [r2+r6]
-%endif
-    psubusb m0, m2, m1
-    psubusb m1, m2
-    por     m0, m1
-    psrlw   m2, m0, 8
-    pand    m0, m5
-    pmaddwd m2, m2
-    pmaddwd m0, m0
-    paddd   m3, m0
-    paddd   m4, m2
-    add     r6, mmsize
-    jl .loopx
-%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
-    jz .no_overread
-    pcmpeqb xm1, xm1
-    pandn   m0, m1, m0 ; zero the lower half
-    pandn   m2, m1, m2
-    psubd   m3, m0
-    psubd   m4, m2
-.no_overread:
-%endif
-    add     r0, r1
-    add     r2, r3
-    dec    r5d
-    jg .loopy
-    mov     r3, r6m
-    mov     r4, r7m
-    HADDD   m3, m0
-    HADDD   m4, m0
-    pxor   xm0, xm0
-    punpckldq xm3, xm0
-    punpckldq xm4, xm0
-    movq  [r3], xm3
-    movq  [r4], xm4
-    RET
-%endmacro ; SSD_NV12
-%endif ; !HIGH_BIT_DEPTH
-
-INIT_MMX mmx2
-SSD_NV12
-INIT_XMM sse2
-SSD_NV12
-INIT_XMM avx
-SSD_NV12
-INIT_YMM avx2
-SSD_NV12
-
 ;=============================================================================
 ; variance
 ;=============================================================================
@@ -2841,183 +2665,6 @@
     RET
 %endmacro
 
-;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
-;-----------------------------------------------------------------------------
-%macro VAR2_8x8_MMX 2
-cglobal pixel_var2_8x%1, 5,6
-    FIX_STRIDES r1, r3
-    VAR_START 0
-    mov      r5d, %1
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+mmsize]
-    psubw     m0, [r2]
-    psubw     m1, [r2+mmsize]
-%else ; !HIGH_BIT_DEPTH
-    movq      m0, [r0]
-    movq      m1, m0
-    movq      m2, [r2]
-    movq      m3, m2
-    punpcklbw m0, m7
-    punpckhbw m1, m7
-    punpcklbw m2, m7
-    punpckhbw m3, m7
-    psubw     m0, m2
-    psubw     m1, m3
-%endif ; HIGH_BIT_DEPTH
-    paddw     m5, m0
-    paddw     m5, m1
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    paddd     m6, m0
-    paddd     m6, m1
-    add       r0, r1
-    add       r2, r3
-    dec       r5d
-    jg .loop
-    VAR2_END %2, m5, m6
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-VAR2_8x8_MMX  8, 6
-VAR2_8x8_MMX 16, 7
-%endif
-
-%macro VAR2_8x8_SSE2 2
-cglobal pixel_var2_8x%1, 5,6,8
-    VAR_START 1
-    mov      r5d, %1/2
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+r1*2]
-    mova      m2, [r2]
-    mova      m3, [r2+r3*2]
-%else ; !HIGH_BIT_DEPTH
-    movq      m1, [r0]
-    movhps    m1, [r0+r1]
-    movq      m3, [r2]
-    movhps    m3, [r2+r3]
-    DEINTB    0, 1, 2, 3, 7
-%endif ; HIGH_BIT_DEPTH
-    psubw     m0, m2
-    psubw     m1, m3
-    paddw     m5, m0
-    paddw     m5, m1
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    paddd     m6, m0
-    paddd     m6, m1
-    lea       r0, [r0+r1*2*SIZEOF_PIXEL]
-    lea       r2, [r2+r3*2*SIZEOF_PIXEL]
-    dec      r5d
-    jg .loop
-    VAR2_END %2, m5, m6
-%endmacro
-
-INIT_XMM sse2
-VAR2_8x8_SSE2  8, 6
-VAR2_8x8_SSE2 16, 7
-
-%if HIGH_BIT_DEPTH == 0
-%macro VAR2_8x8_SSSE3 2
-cglobal pixel_var2_8x%1, 5,6,8
-    pxor      m5, m5    ; sum
-    pxor      m6, m6    ; sum squared
-    mova      m7, [hsub_mul]
-    mov      r5d, %1/4
-.loop:
-    movq      m0, [r0]
-    movq      m2, [r2]
-    movq      m1, [r0+r1]
-    movq      m3, [r2+r3]
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    punpcklbw m0, m2
-    punpcklbw m1, m3
-    movq      m2, [r0]
-    movq      m3, [r2]
-    punpcklbw m2, m3
-    movq      m3, [r0+r1]
-    movq      m4, [r2+r3]
-    punpcklbw m3, m4
-    pmaddubsw m0, m7
-    pmaddubsw m1, m7
-    pmaddubsw m2, m7
-    pmaddubsw m3, m7
-    paddw     m5, m0
-    paddw     m5, m1
-    paddw     m5, m2
-    paddw     m5, m3
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    pmaddwd   m2, m2
-    pmaddwd   m3, m3
-    paddd     m6, m0
-    paddd     m6, m1
-    paddd     m6, m2
-    paddd     m6, m3
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    dec      r5d
-    jg .loop
-    VAR2_END %2, m5, m6
-%endmacro
-
-INIT_XMM ssse3
-VAR2_8x8_SSSE3  8, 6
-VAR2_8x8_SSSE3 16, 7
-INIT_XMM xop
-VAR2_8x8_SSSE3  8, 6
-VAR2_8x8_SSSE3 16, 7
-
-%macro VAR2_8x8_AVX2 2
-cglobal pixel_var2_8x%1, 5,6,6
-    pxor      m3, m3    ; sum
-    pxor      m4, m4    ; sum squared
-    mova      m5, [hsub_mul]
-    mov      r5d, %1/4
-.loop:
-    movq     xm0, [r0]
-    movq     xm1, [r2]
-    vinserti128 m0, m0, [r0+r1], 1
-    vinserti128 m1, m1, [r2+r3], 1
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    punpcklbw m0, m1
-    movq     xm1, [r0]
-    movq     xm2, [r2]
-    vinserti128 m1, m1, [r0+r1], 1
-    vinserti128 m2, m2, [r2+r3], 1
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    punpcklbw m1, m2
-    pmaddubsw m0, m5
-    pmaddubsw m1, m5
-    paddw     m3, m0
-    paddw     m3, m1
-    pmaddwd   m0, m0
-    pmaddwd   m1, m1
-    paddd     m4, m0
-    paddd     m4, m1
-    dec      r5d
-    jg .loop
-    vextracti128 xm0, m3, 1
-    vextracti128 xm1, m4, 1
-    paddw    xm3, xm0
-    paddd    xm4, xm1
-    VAR2_END %2, xm3, xm4
-%endmacro
-
-INIT_YMM avx2
-VAR2_8x8_AVX2  8, 6
-VAR2_8x8_AVX2 16, 7
-
-%endif ; !HIGH_BIT_DEPTH
-
 ;=============================================================================
 ; SATD
 ;=============================================================================
@@ -9009,446 +8656,6 @@
 INIT_XMM avx
 SSIM
 
-;-----------------------------------------------------------------------------
-; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-;-----------------------------------------------------------------------------
-%macro ASD8 0
-cglobal pixel_asd8, 5,5
-    pxor     m0, m0
-    pxor     m1, m1
-.loop:
-%if HIGH_BIT_DEPTH
-    paddw    m0, [r0]
-    paddw    m1, [r2]
-    paddw    m0, [r0+2*r1]
-    paddw    m1, [r2+2*r3]
-    lea      r0, [r0+4*r1]
-    paddw    m0, [r0]
-    paddw    m1, [r2+4*r3]
-    lea      r2, [r2+4*r3]
-    paddw    m0, [r0+2*r1]
-    paddw    m1, [r2+2*r3]
-    lea      r0, [r0+4*r1]
-    lea      r2, [r2+4*r3]
-%else
-    movq     m2, [r0]
-    movq     m3, [r2]
-    movhps   m2, [r0+r1]
-    movhps   m3, [r2+r3]
-    lea      r0, [r0+2*r1]
-    psadbw   m2, m1
-    psadbw   m3, m1
-    movq     m4, [r0]
-    movq     m5, [r2+2*r3]
-    lea      r2, [r2+2*r3]
-    movhps   m4, [r0+r1]
-    movhps   m5, [r2+r3]
-    lea      r0, [r0+2*r1]
-    paddw    m0, m2
-    psubw    m0, m3
-    psadbw   m4, m1
-    psadbw   m5, m1
-    lea      r2, [r2+2*r3]
-    paddw    m0, m4
-    psubw    m0, m5
-%endif
-    sub     r4d, 4
-    jg .loop
-%if HIGH_BIT_DEPTH
-    psubw    m0, m1
-    HADDW    m0, m1
-    ABSD     m1, m0
-%else
-    movhlps  m1, m0
-    paddw    m0, m1
-    ABSW     m1, m0
-%endif
-    movd    eax, m1
-    RET
-%endmacro
-
-INIT_XMM sse2
-ASD8
-INIT_XMM ssse3
-ASD8
-%if HIGH_BIT_DEPTH
-INIT_XMM xop
-ASD8
-%endif
-
-;=============================================================================
-; Successive Elimination ADS
-;=============================================================================
-
-%macro ADS_START 0
-%if UNIX64
-    movsxd  r5, r5d
-%else
-    mov    r5d, r5m
-%endif
-    mov    r0d, r5d
-    lea     r6, [r4+r5+(mmsize-1)]
-    and     r6, ~(mmsize-1)
-    shl     r2d,  1
-%endmacro
-
-%macro ADS_END 1 ; unroll_size
-    add     r1, 8*%1
-    add     r3, 8*%1
-    add     r6, 4*%1
-    sub    r0d, 4*%1
-    jg .loop
-    WIN64_RESTORE_XMM rsp
-%if mmsize==32
-    vzeroupper
-%endif
-    lea     r6, [r4+r5+(mmsize-1)]
-    and     r6, ~(mmsize-1)
-%if cpuflag(ssse3)
-    jmp ads_mvs_ssse3
-%else
-    jmp ads_mvs_mmx
-%endif
-%endmacro
-
-;-----------------------------------------------------------------------------
-; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
-;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
-;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_ads4, 5,7
-    mova    m6, [r0]
-    mova    m4, [r0+8]
-    pshufw  m7, m6, 0
-    pshufw  m6, m6, q2222
-    pshufw  m5, m4, 0
-    pshufw  m4, m4, q2222
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+16]
-    psubw     m0, m7
-    psubw     m1, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    movu      m2, [r1+r2]
-    movu      m3, [r1+r2+16]
-    psubw     m2, m5
-    psubw     m3, m4
-    paddw     m0, m1
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    pshufw    m1, r6m, 0
-    paddusw   m0, [r3]
-    psubusw   m1, m0
-    packsswb  m1, m1
-    movd    [r6], m1
-    ADS_END 1
-
-cglobal pixel_ads2, 5,7
-    mova      m6, [r0]
-    pshufw    m5, r6m, 0
-    pshufw    m7, m6, 0
-    pshufw    m6, m6, q2222
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+r2]
-    psubw     m0, m7
-    psubw     m1, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    paddw     m0, m1
-    paddusw   m0, [r3]
-    mova      m4, m5
-    psubusw   m4, m0
-    packsswb  m4, m4
-    movd    [r6], m4
-    ADS_END 1
-
-cglobal pixel_ads1, 5,7
-    pshufw    m7, [r0], 0
-    pshufw    m6, r6m, 0
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+8]
-    psubw     m0, m7
-    psubw     m1, m7
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    paddusw   m0, [r3]
-    paddusw   m1, [r3+8]
-    mova      m4, m6
-    mova      m5, m6
-    psubusw   m4, m0
-    psubusw   m5, m1
-    packsswb  m4, m5
-    mova    [r6], m4
-    ADS_END 2
-
-%macro ADS_XMM 0
-%if mmsize==32
-cglobal pixel_ads4, 5,7,8
-    vpbroadcastw m7, [r0+ 0]
-    vpbroadcastw m6, [r0+ 4]
-    vpbroadcastw m5, [r0+ 8]
-    vpbroadcastw m4, [r0+12]
-%else
-cglobal pixel_ads4, 5,7,12
-    mova      m4, [r0]
-    pshuflw   m7, m4, q0000
-    pshuflw   m6, m4, q2222
-    pshufhw   m5, m4, q0000
-    pshufhw   m4, m4, q2222
-    punpcklqdq m7, m7
-    punpcklqdq m6, m6
-    punpckhqdq m5, m5
-    punpckhqdq m4, m4
-%endif
-%if ARCH_X86_64 && mmsize == 16
-    movd      m8, r6m
-    SPLATW    m8, m8
-    ADS_START
-    movu     m10, [r1]
-    movu     m11, [r1+r2]
-.loop:
-    psubw     m0, m10, m7
-    movu     m10, [r1+16]
-    psubw     m1, m10, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    psubw     m2, m11, m5
-    movu     m11, [r1+r2+16]
-    paddw     m0, m1
-    psubw     m3, m11, m4
-    movu      m9, [r3]
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    paddusw   m0, m9
-    psubusw   m1, m8, m0
-%else
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+16]
-    psubw     m0, m7
-    psubw     m1, m6
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    movu      m2, [r1+r2]
-    movu      m3, [r1+r2+16]
-    psubw     m2, m5
-    psubw     m3, m4
-    paddw     m0, m1
-    ABSW      m2, m2, m1
-    ABSW      m3, m3, m1
-    paddw     m0, m2
-    paddw     m0, m3
-    movu      m2, [r3]
-%if mmsize==32
-    vpbroadcastw m1, r6m
-%else
-    movd      m1, r6m
-    pshuflw   m1, m1, 0
-    punpcklqdq m1, m1
-%endif
-    paddusw   m0, m2
-    psubusw   m1, m0
-%endif ; ARCH
-    packsswb  m1, m1
-%if mmsize==32
-    vpermq    m1, m1, q3120
-    mova    [r6], xm1
-%else
-    movh    [r6], m1
-%endif
-    ADS_END mmsize/8
-
-cglobal pixel_ads2, 5,7,8
-%if mmsize==32
-    vpbroadcastw m7, [r0+0]
-    vpbroadcastw m6, [r0+4]
-    vpbroadcastw m5, r6m
-%else
-    movq      m6, [r0]
-    movd      m5, r6m
-    pshuflw   m7, m6, 0
-    pshuflw   m6, m6, q2222
-    pshuflw   m5, m5, 0
-    punpcklqdq m7, m7
-    punpcklqdq m6, m6
-    punpcklqdq m5, m5
-%endif
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+r2]
-    psubw     m0, m7
-    psubw     m1, m6
-    movu      m4, [r3]
-    ABSW      m0, m0, m2
-    ABSW      m1, m1, m3
-    paddw     m0, m1
-    paddusw   m0, m4
-    psubusw   m1, m5, m0
-    packsswb  m1, m1
-%if mmsize==32
-    vpermq    m1, m1, q3120
-    mova    [r6], xm1
-%else
-    movh    [r6], m1
-%endif
-    ADS_END mmsize/8
-
-cglobal pixel_ads1, 5,7,8
-%if mmsize==32
-    vpbroadcastw m7, [r0]
-    vpbroadcastw m6, r6m
-%else
-    movd      m7, [r0]
-    movd      m6, r6m
-    pshuflw   m7, m7, 0
-    pshuflw   m6, m6, 0
-    punpcklqdq m7, m7
-    punpcklqdq m6, m6
-%endif
-    ADS_START
-.loop:
-    movu      m0, [r1]
-    movu      m1, [r1+mmsize]
-    psubw     m0, m7
-    psubw     m1, m7
-    movu      m2, [r3]
-    movu      m3, [r3+mmsize]
-    ABSW      m0, m0, m4
-    ABSW      m1, m1, m5
-    paddusw   m0, m2
-    paddusw   m1, m3
-    psubusw   m4, m6, m0
-    psubusw   m5, m6, m1
-    packsswb  m4, m5
-%if mmsize==32
-    vpermq    m4, m4, q3120
-%endif
-    mova    [r6], m4
-    ADS_END mmsize/4
-%endmacro
-
-INIT_XMM sse2
-ADS_XMM
-INIT_XMM ssse3
-ADS_XMM
-INIT_XMM avx
-ADS_XMM
-INIT_YMM avx2
-ADS_XMM
-
-; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
-; {
-;     int nmv=0, i, j;
-;     *(uint32_t*)(masks+width) = 0;
-;     for( i=0; i<width; i+=8 )
-;     {
-;         uint64_t mask = *(uint64_t*)(masks+i);
-;         if( !mask ) continue;
-;         for( j=0; j<8; j++ )
-;             if( mask & (255<<j*8) )
-;                 mvs[nmv++] = i+j;
-;     }
-;     return nmv;
-; }
-
-%macro TEST 1
-    mov     [r4+r0*2], r1w
-    test    r2d, 0xff<<(%1*8)
-    setne   r3b
-    add     r0d, r3d
-    inc     r1d
-%endmacro
-
-INIT_MMX mmx
-cglobal pixel_ads_mvs, 0,7,0
-ads_mvs_mmx:
-    ; mvs = r4
-    ; masks = r6
-    ; width = r5
-    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
-    xor     r0d, r0d
-    xor     r1d, r1d
-    mov     [r6+r5], r0d
-    jmp .loopi
-ALIGN 16
-.loopi0:
-    add     r1d, 8
-    cmp     r1d, r5d
-    jge .end
-.loopi:
-    mov     r2,  [r6+r1]
-%if ARCH_X86_64
-    test    r2,  r2
-%else
-    mov     r3,  r2
-    add    r3d, [r6+r1+4]
-%endif
-    jz .loopi0
-    xor     r3d, r3d
-    TEST 0
-    TEST 1
-    TEST 2
-    TEST 3
-%if ARCH_X86_64
-    shr     r2,  32
-%else
-    mov     r2d, [r6+r1]
-%endif
-    TEST 0
-    TEST 1
-    TEST 2
-    TEST 3
-    cmp     r1d, r5d
-    jl .loopi
-.end:
-    movifnidn eax, r0d
-    RET
-
-INIT_XMM ssse3
-cglobal pixel_ads_mvs, 0,7,0
-ads_mvs_ssse3:
-    mova      m3, [pw_8]
-    mova      m4, [pw_76543210]
-    pxor      m5, m5
-    add       r5, r6
-    xor      r0d, r0d ; nmv
-    mov     [r5], r0d
-%ifdef PIC
-    lea       r1, [$$]
-    %define GLOBAL +r1-$$
-%else
-    %define GLOBAL
-%endif
-.loop:
-    movh      m0, [r6]
-    pcmpeqb   m0, m5
-    pmovmskb r2d, m0
-    xor      r2d, 0xffff                         ; skipping if r2d is zero is slower (branch mispredictions)
-    movzx    r3d, byte [r2+popcnt_table GLOBAL]  ; popcnt
-    add      r2d, r2d
-    ; shuffle counters based on mv mask
-    pshufb    m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
-    movu [r4+r0*2], m2
-    add      r0d, r3d
-    paddw     m4, m3                             ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
-    add       r6, 8
-    cmp       r6, r5
-    jl .loop
-    movifnidn eax, r0d
-    RET
-
 ;-----------------------------------------------------------------
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
 ;-----------------------------------------------------------------
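
For reference (editorial note, not part of the patch): below is a minimal plain-C sketch of what the removed pixel_ssd_nv12_core computed, based only on the signature comment in the hunk above; the function name, variable names, and the U-at-even/V-at-odd NV12 ordering are assumptions. As a quick check of the quoted limits, plugging mmsize = 64 (MMX) and BIT_DEPTH = 10 into 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2 gives about 16416, and mmsize = 128 (XMM) gives about 32832, matching the figures in the removed comment.

/* Illustrative sketch only (not part of this patch): a scalar reference for
 * the removed pixel_ssd_nv12_core, 8-bit case. Accumulates the SSD of the
 * interleaved NV12 chroma plane separately for U (assumed at even offsets)
 * and V (assumed at odd offsets). */
#include <stdint.h>

static void ssd_nv12_core_ref( const uint8_t *pixuv1, intptr_t stride1,
                               const uint8_t *pixuv2, intptr_t stride2,
                               int width, int height,
                               uint64_t *ssd_u, uint64_t *ssd_v )
{
    uint64_t su = 0, sv = 0;
    for( int y = 0; y < height; y++, pixuv1 += stride1, pixuv2 += stride2 )
        for( int x = 0; x < width; x++ )
        {
            int du = pixuv1[2*x]   - pixuv2[2*x];   /* U sample (assumed) */
            int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; /* V sample (assumed) */
            su += (uint64_t)(du * du);
            sv += (uint64_t)(dv * dv);
        }
    *ssd_u = su;
    *ssd_v = sv;
}

The removed SIMD variants keep their intermediate sums in 32-bit lanes, which is what the width limits quoted in their header comments bound.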

