[x265] [PATCH] asm: removed unused code from pixel-a.asm
Steve Borho
steve at borho.org
Mon Dec 2 08:16:55 CET 2013
On Mon, Dec 2, 2013 at 12:49 AM, <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1385966974 -19800
> # Mon Dec 02 12:19:34 2013 +0530
> # Node ID 1695371f63a6cdef5ece9d17f94b286fc17cc29e
> # Parent ace5b9ee099d1539f020e68971a27577148a4a29
> asm: removed unused code from pixel-a.asm
>
queued.
For clarity, we should also prune the unused function declarations from
pixel.h - at least those for functions that have been removed completely or
were never used by x265.
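Roughly along these lines against source/common/x86/pixel.h (a hypothetical
hunk for illustration only - the prototype names and signatures below are
guessed from the asm functions this patch deletes, so the actual header
contents may differ):

--- a/source/common/x86/pixel.h
+++ b/source/common/x86/pixel.h
-/* illustrative only: declarations whose asm bodies were removed and which
-   have no C-side callers in x265 */
-void x265_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
-                                    pixel *pixuv2, intptr_t stride2,
-                                    int width, int height,
-                                    uint64_t *ssd_u, uint64_t *ssd_v );
-int  x265_pixel_ads4_sse2( int enc_dc[4], uint16_t *sums, int delta,
-                           uint16_t *cost_mvx, int16_t *mvs,
-                           int width, int thresh );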
>
> diff -r ace5b9ee099d -r 1695371f63a6 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Mon Dec 02 11:25:00 2013 +0530
> +++ b/source/common/x86/pixel-a.asm Mon Dec 02 12:19:34 2013 +0530
> @@ -2242,182 +2242,6 @@
> movd eax, m7
> RET
>
> -;-----------------------------------------------------------------------------
> -; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
> -;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> -;
> -; The maximum width this function can handle without risk of overflow is given
> -; in the following equation: (mmsize in bits)
> -;
> -;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
> -;
> -; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
> -; distortion levels it will take much more than that though.
> -;-----------------------------------------------------------------------------
> -%if HIGH_BIT_DEPTH
> -%macro SSD_NV12 0
> -cglobal pixel_ssd_nv12_core, 6,7,7
> - shl r4d, 2
> - FIX_STRIDES r1, r3
> - add r0, r4
> - add r2, r4
> - xor r6, r6
> - pxor m4, m4
> - pxor m5, m5
> - pxor m6, m6
> -.loopy:
> - mov r6, r4
> - neg r6
> - pxor m2, m2
> - pxor m3, m3
> -.loopx:
> - mova m0, [r0+r6]
> - mova m1, [r0+r6+mmsize]
> - psubw m0, [r2+r6]
> - psubw m1, [r2+r6+mmsize]
> - PSHUFLW m0, m0, q3120
> - PSHUFLW m1, m1, q3120
> -%if mmsize >= 16
> - pshufhw m0, m0, q3120
> - pshufhw m1, m1, q3120
> -%endif
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m2, m0
> - paddd m3, m1
> - add r6, 2*mmsize
> - jl .loopx
> -%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
> - jz .no_overread
> - psubd m3, m1
> -.no_overread:
> -%endif
> -%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
> - ; equation above, putting the width limit at 8208
> - punpckhdq m0, m2, m6
> - punpckhdq m1, m3, m6
> - punpckldq m2, m6
> - punpckldq m3, m6
> - paddq m3, m2
> - paddq m1, m0
> - paddq m4, m3
> - paddq m4, m1
> -%else ; unfortunately paddq is sse2
> - ; emulate 48 bit precision for mmx2 instead
> - mova m0, m2
> - mova m1, m3
> - punpcklwd m2, m6
> - punpcklwd m3, m6
> - punpckhwd m0, m6
> - punpckhwd m1, m6
> - paddd m3, m2
> - paddd m1, m0
> - paddd m4, m3
> - paddd m5, m1
> -%endif
> - add r0, r1
> - add r2, r3
> - dec r5d
> - jg .loopy
> - mov r3, r6m
> - mov r4, r7m
> -%if mmsize == 32
> - vextracti128 xm0, m4, 1
> - paddq xm4, xm0
> -%endif
> -%if mmsize >= 16
> - movq [r3], xm4
> - movhps [r4], xm4
> -%else ; fixup for mmx2
> - SBUTTERFLY dq, 4, 5, 0
> - mova m0, m4
> - psrld m4, 16
> - paddd m5, m4
> - pslld m0, 16
> - SBUTTERFLY dq, 0, 5, 4
> - psrlq m0, 16
> - psrlq m5, 16
> - movq [r3], m0
> - movq [r4], m5
> -%endif
> - RET
> -%endmacro ; SSD_NV12
> -%endif ; HIGH_BIT_DEPTH
> -
> -%if HIGH_BIT_DEPTH == 0
> -;-----------------------------------------------------------------------------
> -; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
> -;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> -;
> -; This implementation can potentially overflow on image widths >= 11008 (or
> -; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
> -; 20). At sane distortion levels it will take much more than that though.
> -;-----------------------------------------------------------------------------
> -%macro SSD_NV12 0
> -cglobal pixel_ssd_nv12_core, 6,7
> - add r4d, r4d
> - add r0, r4
> - add r2, r4
> - pxor m3, m3
> - pxor m4, m4
> - mova m5, [pw_00ff]
> -.loopy:
> - mov r6, r4
> - neg r6
> -.loopx:
> -%if mmsize == 32 ; only 16-byte alignment is guaranteed
> - movu m2, [r0+r6]
> - movu m1, [r2+r6]
> -%else
> - mova m2, [r0+r6]
> - mova m1, [r2+r6]
> -%endif
> - psubusb m0, m2, m1
> - psubusb m1, m2
> - por m0, m1
> - psrlw m2, m0, 8
> - pand m0, m5
> - pmaddwd m2, m2
> - pmaddwd m0, m0
> - paddd m3, m0
> - paddd m4, m2
> - add r6, mmsize
> - jl .loopx
> -%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
> - jz .no_overread
> - pcmpeqb xm1, xm1
> - pandn m0, m1, m0 ; zero the lower half
> - pandn m2, m1, m2
> - psubd m3, m0
> - psubd m4, m2
> -.no_overread:
> -%endif
> - add r0, r1
> - add r2, r3
> - dec r5d
> - jg .loopy
> - mov r3, r6m
> - mov r4, r7m
> - HADDD m3, m0
> - HADDD m4, m0
> - pxor xm0, xm0
> - punpckldq xm3, xm0
> - punpckldq xm4, xm0
> - movq [r3], xm3
> - movq [r4], xm4
> - RET
> -%endmacro ; SSD_NV12
> -%endif ; !HIGH_BIT_DEPTH
> -
> -INIT_MMX mmx2
> -SSD_NV12
> -INIT_XMM sse2
> -SSD_NV12
> -INIT_XMM avx
> -SSD_NV12
> -INIT_YMM avx2
> -SSD_NV12
> -
>
> ;=============================================================================
> ; variance
> ;=============================================================================
> @@ -2841,183 +2665,6 @@
> RET
> %endmacro
>
> -;-----------------------------------------------------------------------------
> -; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
> -;-----------------------------------------------------------------------------
> -%macro VAR2_8x8_MMX 2
> -cglobal pixel_var2_8x%1, 5,6
> - FIX_STRIDES r1, r3
> - VAR_START 0
> - mov r5d, %1
> -.loop:
> -%if HIGH_BIT_DEPTH
> - mova m0, [r0]
> - mova m1, [r0+mmsize]
> - psubw m0, [r2]
> - psubw m1, [r2+mmsize]
> -%else ; !HIGH_BIT_DEPTH
> - movq m0, [r0]
> - movq m1, m0
> - movq m2, [r2]
> - movq m3, m2
> - punpcklbw m0, m7
> - punpckhbw m1, m7
> - punpcklbw m2, m7
> - punpckhbw m3, m7
> - psubw m0, m2
> - psubw m1, m3
> -%endif ; HIGH_BIT_DEPTH
> - paddw m5, m0
> - paddw m5, m1
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m6, m0
> - paddd m6, m1
> - add r0, r1
> - add r2, r3
> - dec r5d
> - jg .loop
> - VAR2_END %2, m5, m6
> -%endmacro
> -
> -%if ARCH_X86_64 == 0
> -INIT_MMX mmx2
> -VAR2_8x8_MMX 8, 6
> -VAR2_8x8_MMX 16, 7
> -%endif
> -
> -%macro VAR2_8x8_SSE2 2
> -cglobal pixel_var2_8x%1, 5,6,8
> - VAR_START 1
> - mov r5d, %1/2
> -.loop:
> -%if HIGH_BIT_DEPTH
> - mova m0, [r0]
> - mova m1, [r0+r1*2]
> - mova m2, [r2]
> - mova m3, [r2+r3*2]
> -%else ; !HIGH_BIT_DEPTH
> - movq m1, [r0]
> - movhps m1, [r0+r1]
> - movq m3, [r2]
> - movhps m3, [r2+r3]
> - DEINTB 0, 1, 2, 3, 7
> -%endif ; HIGH_BIT_DEPTH
> - psubw m0, m2
> - psubw m1, m3
> - paddw m5, m0
> - paddw m5, m1
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m6, m0
> - paddd m6, m1
> - lea r0, [r0+r1*2*SIZEOF_PIXEL]
> - lea r2, [r2+r3*2*SIZEOF_PIXEL]
> - dec r5d
> - jg .loop
> - VAR2_END %2, m5, m6
> -%endmacro
> -
> -INIT_XMM sse2
> -VAR2_8x8_SSE2 8, 6
> -VAR2_8x8_SSE2 16, 7
> -
> -%if HIGH_BIT_DEPTH == 0
> -%macro VAR2_8x8_SSSE3 2
> -cglobal pixel_var2_8x%1, 5,6,8
> - pxor m5, m5 ; sum
> - pxor m6, m6 ; sum squared
> - mova m7, [hsub_mul]
> - mov r5d, %1/4
> -.loop:
> - movq m0, [r0]
> - movq m2, [r2]
> - movq m1, [r0+r1]
> - movq m3, [r2+r3]
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - punpcklbw m0, m2
> - punpcklbw m1, m3
> - movq m2, [r0]
> - movq m3, [r2]
> - punpcklbw m2, m3
> - movq m3, [r0+r1]
> - movq m4, [r2+r3]
> - punpcklbw m3, m4
> - pmaddubsw m0, m7
> - pmaddubsw m1, m7
> - pmaddubsw m2, m7
> - pmaddubsw m3, m7
> - paddw m5, m0
> - paddw m5, m1
> - paddw m5, m2
> - paddw m5, m3
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - pmaddwd m2, m2
> - pmaddwd m3, m3
> - paddd m6, m0
> - paddd m6, m1
> - paddd m6, m2
> - paddd m6, m3
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - dec r5d
> - jg .loop
> - VAR2_END %2, m5, m6
> -%endmacro
> -
> -INIT_XMM ssse3
> -VAR2_8x8_SSSE3 8, 6
> -VAR2_8x8_SSSE3 16, 7
> -INIT_XMM xop
> -VAR2_8x8_SSSE3 8, 6
> -VAR2_8x8_SSSE3 16, 7
> -
> -%macro VAR2_8x8_AVX2 2
> -cglobal pixel_var2_8x%1, 5,6,6
> - pxor m3, m3 ; sum
> - pxor m4, m4 ; sum squared
> - mova m5, [hsub_mul]
> - mov r5d, %1/4
> -.loop:
> - movq xm0, [r0]
> - movq xm1, [r2]
> - vinserti128 m0, m0, [r0+r1], 1
> - vinserti128 m1, m1, [r2+r3], 1
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - punpcklbw m0, m1
> - movq xm1, [r0]
> - movq xm2, [r2]
> - vinserti128 m1, m1, [r0+r1], 1
> - vinserti128 m2, m2, [r2+r3], 1
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - punpcklbw m1, m2
> - pmaddubsw m0, m5
> - pmaddubsw m1, m5
> - paddw m3, m0
> - paddw m3, m1
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m4, m0
> - paddd m4, m1
> - dec r5d
> - jg .loop
> - vextracti128 xm0, m3, 1
> - vextracti128 xm1, m4, 1
> - paddw xm3, xm0
> - paddd xm4, xm1
> - VAR2_END %2, xm3, xm4
> -%endmacro
> -
> -INIT_YMM avx2
> -VAR2_8x8_AVX2 8, 6
> -VAR2_8x8_AVX2 16, 7
> -
> -%endif ; !HIGH_BIT_DEPTH
> -
>
> ;=============================================================================
> ; SATD
> ;=============================================================================
> @@ -9009,446 +8656,6 @@
> INIT_XMM avx
> SSIM
>
> -;-----------------------------------------------------------------------------
> -; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
> -;-----------------------------------------------------------------------------
> -%macro ASD8 0
> -cglobal pixel_asd8, 5,5
> - pxor m0, m0
> - pxor m1, m1
> -.loop:
> -%if HIGH_BIT_DEPTH
> - paddw m0, [r0]
> - paddw m1, [r2]
> - paddw m0, [r0+2*r1]
> - paddw m1, [r2+2*r3]
> - lea r0, [r0+4*r1]
> - paddw m0, [r0]
> - paddw m1, [r2+4*r3]
> - lea r2, [r2+4*r3]
> - paddw m0, [r0+2*r1]
> - paddw m1, [r2+2*r3]
> - lea r0, [r0+4*r1]
> - lea r2, [r2+4*r3]
> -%else
> - movq m2, [r0]
> - movq m3, [r2]
> - movhps m2, [r0+r1]
> - movhps m3, [r2+r3]
> - lea r0, [r0+2*r1]
> - psadbw m2, m1
> - psadbw m3, m1
> - movq m4, [r0]
> - movq m5, [r2+2*r3]
> - lea r2, [r2+2*r3]
> - movhps m4, [r0+r1]
> - movhps m5, [r2+r3]
> - lea r0, [r0+2*r1]
> - paddw m0, m2
> - psubw m0, m3
> - psadbw m4, m1
> - psadbw m5, m1
> - lea r2, [r2+2*r3]
> - paddw m0, m4
> - psubw m0, m5
> -%endif
> - sub r4d, 4
> - jg .loop
> -%if HIGH_BIT_DEPTH
> - psubw m0, m1
> - HADDW m0, m1
> - ABSD m1, m0
> -%else
> - movhlps m1, m0
> - paddw m0, m1
> - ABSW m1, m0
> -%endif
> - movd eax, m1
> - RET
> -%endmacro
> -
> -INIT_XMM sse2
> -ASD8
> -INIT_XMM ssse3
> -ASD8
> -%if HIGH_BIT_DEPTH
> -INIT_XMM xop
> -ASD8
> -%endif
> -
> -;=============================================================================
> -; Successive Elimination ADS
> -;=============================================================================
> -
> -%macro ADS_START 0
> -%if UNIX64
> - movsxd r5, r5d
> -%else
> - mov r5d, r5m
> -%endif
> - mov r0d, r5d
> - lea r6, [r4+r5+(mmsize-1)]
> - and r6, ~(mmsize-1)
> - shl r2d, 1
> -%endmacro
> -
> -%macro ADS_END 1 ; unroll_size
> - add r1, 8*%1
> - add r3, 8*%1
> - add r6, 4*%1
> - sub r0d, 4*%1
> - jg .loop
> - WIN64_RESTORE_XMM rsp
> -%if mmsize==32
> - vzeroupper
> -%endif
> - lea r6, [r4+r5+(mmsize-1)]
> - and r6, ~(mmsize-1)
> -%if cpuflag(ssse3)
> - jmp ads_mvs_ssse3
> -%else
> - jmp ads_mvs_mmx
> -%endif
> -%endmacro
> -
> -;-----------------------------------------------------------------------------
> -; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
> -;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
> -;-----------------------------------------------------------------------------
> -INIT_MMX mmx2
> -cglobal pixel_ads4, 5,7
> - mova m6, [r0]
> - mova m4, [r0+8]
> - pshufw m7, m6, 0
> - pshufw m6, m6, q2222
> - pshufw m5, m4, 0
> - pshufw m4, m4, q2222
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+16]
> - psubw m0, m7
> - psubw m1, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - movu m2, [r1+r2]
> - movu m3, [r1+r2+16]
> - psubw m2, m5
> - psubw m3, m4
> - paddw m0, m1
> - ABSW m2, m2, m1
> - ABSW m3, m3, m1
> - paddw m0, m2
> - paddw m0, m3
> - pshufw m1, r6m, 0
> - paddusw m0, [r3]
> - psubusw m1, m0
> - packsswb m1, m1
> - movd [r6], m1
> - ADS_END 1
> -
> -cglobal pixel_ads2, 5,7
> - mova m6, [r0]
> - pshufw m5, r6m, 0
> - pshufw m7, m6, 0
> - pshufw m6, m6, q2222
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+r2]
> - psubw m0, m7
> - psubw m1, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - paddw m0, m1
> - paddusw m0, [r3]
> - mova m4, m5
> - psubusw m4, m0
> - packsswb m4, m4
> - movd [r6], m4
> - ADS_END 1
> -
> -cglobal pixel_ads1, 5,7
> - pshufw m7, [r0], 0
> - pshufw m6, r6m, 0
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+8]
> - psubw m0, m7
> - psubw m1, m7
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - paddusw m0, [r3]
> - paddusw m1, [r3+8]
> - mova m4, m6
> - mova m5, m6
> - psubusw m4, m0
> - psubusw m5, m1
> - packsswb m4, m5
> - mova [r6], m4
> - ADS_END 2
> -
> -%macro ADS_XMM 0
> -%if mmsize==32
> -cglobal pixel_ads4, 5,7,8
> - vpbroadcastw m7, [r0+ 0]
> - vpbroadcastw m6, [r0+ 4]
> - vpbroadcastw m5, [r0+ 8]
> - vpbroadcastw m4, [r0+12]
> -%else
> -cglobal pixel_ads4, 5,7,12
> - mova m4, [r0]
> - pshuflw m7, m4, q0000
> - pshuflw m6, m4, q2222
> - pshufhw m5, m4, q0000
> - pshufhw m4, m4, q2222
> - punpcklqdq m7, m7
> - punpcklqdq m6, m6
> - punpckhqdq m5, m5
> - punpckhqdq m4, m4
> -%endif
> -%if ARCH_X86_64 && mmsize == 16
> - movd m8, r6m
> - SPLATW m8, m8
> - ADS_START
> - movu m10, [r1]
> - movu m11, [r1+r2]
> -.loop:
> - psubw m0, m10, m7
> - movu m10, [r1+16]
> - psubw m1, m10, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - psubw m2, m11, m5
> - movu m11, [r1+r2+16]
> - paddw m0, m1
> - psubw m3, m11, m4
> - movu m9, [r3]
> - ABSW m2, m2, m1
> - ABSW m3, m3, m1
> - paddw m0, m2
> - paddw m0, m3
> - paddusw m0, m9
> - psubusw m1, m8, m0
> -%else
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+16]
> - psubw m0, m7
> - psubw m1, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - movu m2, [r1+r2]
> - movu m3, [r1+r2+16]
> - psubw m2, m5
> - psubw m3, m4
> - paddw m0, m1
> - ABSW m2, m2, m1
> - ABSW m3, m3, m1
> - paddw m0, m2
> - paddw m0, m3
> - movu m2, [r3]
> -%if mmsize==32
> - vpbroadcastw m1, r6m
> -%else
> - movd m1, r6m
> - pshuflw m1, m1, 0
> - punpcklqdq m1, m1
> -%endif
> - paddusw m0, m2
> - psubusw m1, m0
> -%endif ; ARCH
> - packsswb m1, m1
> -%if mmsize==32
> - vpermq m1, m1, q3120
> - mova [r6], xm1
> -%else
> - movh [r6], m1
> -%endif
> - ADS_END mmsize/8
> -
> -cglobal pixel_ads2, 5,7,8
> -%if mmsize==32
> - vpbroadcastw m7, [r0+0]
> - vpbroadcastw m6, [r0+4]
> - vpbroadcastw m5, r6m
> -%else
> - movq m6, [r0]
> - movd m5, r6m
> - pshuflw m7, m6, 0
> - pshuflw m6, m6, q2222
> - pshuflw m5, m5, 0
> - punpcklqdq m7, m7
> - punpcklqdq m6, m6
> - punpcklqdq m5, m5
> -%endif
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+r2]
> - psubw m0, m7
> - psubw m1, m6
> - movu m4, [r3]
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - paddw m0, m1
> - paddusw m0, m4
> - psubusw m1, m5, m0
> - packsswb m1, m1
> -%if mmsize==32
> - vpermq m1, m1, q3120
> - mova [r6], xm1
> -%else
> - movh [r6], m1
> -%endif
> - ADS_END mmsize/8
> -
> -cglobal pixel_ads1, 5,7,8
> -%if mmsize==32
> - vpbroadcastw m7, [r0]
> - vpbroadcastw m6, r6m
> -%else
> - movd m7, [r0]
> - movd m6, r6m
> - pshuflw m7, m7, 0
> - pshuflw m6, m6, 0
> - punpcklqdq m7, m7
> - punpcklqdq m6, m6
> -%endif
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+mmsize]
> - psubw m0, m7
> - psubw m1, m7
> - movu m2, [r3]
> - movu m3, [r3+mmsize]
> - ABSW m0, m0, m4
> - ABSW m1, m1, m5
> - paddusw m0, m2
> - paddusw m1, m3
> - psubusw m4, m6, m0
> - psubusw m5, m6, m1
> - packsswb m4, m5
> -%if mmsize==32
> - vpermq m4, m4, q3120
> -%endif
> - mova [r6], m4
> - ADS_END mmsize/4
> -%endmacro
> -
> -INIT_XMM sse2
> -ADS_XMM
> -INIT_XMM ssse3
> -ADS_XMM
> -INIT_XMM avx
> -ADS_XMM
> -INIT_YMM avx2
> -ADS_XMM
> -
> -; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
> -; {
> -; int nmv=0, i, j;
> -; *(uint32_t*)(masks+width) = 0;
> -; for( i=0; i<width; i+=8 )
> -; {
> -; uint64_t mask = *(uint64_t*)(masks+i);
> -; if( !mask ) continue;
> -; for( j=0; j<8; j++ )
> -; if( mask & (255<<j*8) )
> -; mvs[nmv++] = i+j;
> -; }
> -; return nmv;
> -; }
> -
> -%macro TEST 1
> - mov [r4+r0*2], r1w
> - test r2d, 0xff<<(%1*8)
> - setne r3b
> - add r0d, r3d
> - inc r1d
> -%endmacro
> -
> -INIT_MMX mmx
> -cglobal pixel_ads_mvs, 0,7,0
> -ads_mvs_mmx:
> - ; mvs = r4
> - ; masks = r6
> - ; width = r5
> - ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
> - xor r0d, r0d
> - xor r1d, r1d
> - mov [r6+r5], r0d
> - jmp .loopi
> -ALIGN 16
> -.loopi0:
> - add r1d, 8
> - cmp r1d, r5d
> - jge .end
> -.loopi:
> - mov r2, [r6+r1]
> -%if ARCH_X86_64
> - test r2, r2
> -%else
> - mov r3, r2
> - add r3d, [r6+r1+4]
> -%endif
> - jz .loopi0
> - xor r3d, r3d
> - TEST 0
> - TEST 1
> - TEST 2
> - TEST 3
> -%if ARCH_X86_64
> - shr r2, 32
> -%else
> - mov r2d, [r6+r1]
> -%endif
> - TEST 0
> - TEST 1
> - TEST 2
> - TEST 3
> - cmp r1d, r5d
> - jl .loopi
> -.end:
> - movifnidn eax, r0d
> - RET
> -
> -INIT_XMM ssse3
> -cglobal pixel_ads_mvs, 0,7,0
> -ads_mvs_ssse3:
> - mova m3, [pw_8]
> - mova m4, [pw_76543210]
> - pxor m5, m5
> - add r5, r6
> - xor r0d, r0d ; nmv
> - mov [r5], r0d
> -%ifdef PIC
> - lea r1, [$$]
> - %define GLOBAL +r1-$$
> -%else
> - %define GLOBAL
> -%endif
> -.loop:
> - movh m0, [r6]
> - pcmpeqb m0, m5
> - pmovmskb r2d, m0
> - xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
> - movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
> - add r2d, r2d
> - ; shuffle counters based on mv mask
> - pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
> - movu [r4+r0*2], m2
> - add r0d, r3d
> - paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
> - add r6, 8
> - cmp r6, r5
> - jl .loop
> - movifnidn eax, r0d
> - RET
> -
> ;-----------------------------------------------------------------
> ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho