<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Mon, Dec 2, 2013 at 12:49 AM, <span dir="ltr"><<a href="mailto:murugan@multicorewareinc.com" target="_blank">murugan@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Murugan Vairavel <<a href="mailto:murugan@multicorewareinc.com">murugan@multicorewareinc.com</a>><br>
# Date 1385966974 -19800<br>
# Mon Dec 02 12:19:34 2013 +0530<br>
# Node ID 1695371f63a6cdef5ece9d17f94b286fc17cc29e<br>
# Parent ace5b9ee099d1539f020e68971a27577148a4a29<br>
asm: removed unused code from pixel-a.asm<br></blockquote><div><br></div><div>queued.</div><div><br></div><div>For clarity, we should also prune unused function definitions from pixel.h - at least the functions that have been removed completely or never used by x265</div>
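For instance (a hypothetical sketch - the exact contents of pixel.h may differ, and the signatures below are inferred from the comment blocks in the asm removed below), the pruning would drop prototypes like these:

    /* pixel.h declarations left without asm definitions after this patch.
     * "pixel" is x265's bit-depth-dependent typedef (uint8_t or uint16_t). */
    void x265_pixel_ssd_nv12_core_sse2(uint16_t *pixuv1, intptr_t stride1,
                                       uint16_t *pixuv2, intptr_t stride2,
                                       int width, int height,
                                       uint64_t *ssd_u, uint64_t *ssd_v);
    int x265_pixel_var2_8x8_sse2(pixel *pix1, intptr_t stride1,
                                 pixel *pix2, intptr_t stride2, int *ssd);
    int x265_pixel_asd8_sse2(pixel *pix1, intptr_t stride1,
                             pixel *pix2, intptr_t stride2, int height);
    int x265_pixel_ads4_sse2(int enc_dc[4], uint16_t *sums, int delta,
                             uint16_t *cost_mvx, int16_t *mvs,
                             int width, int thresh);
    int x265_pixel_ads_mvs_ssse3(int16_t *mvs, uint8_t *masks, int width);

The same would apply to the remaining cpu-suffixed variants (mmx2, avx, xop, avx2) and to pixel_ads1/pixel_ads2.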
> diff -r ace5b9ee099d -r 1695371f63a6 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Mon Dec 02 11:25:00 2013 +0530
> +++ b/source/common/x86/pixel-a.asm Mon Dec 02 12:19:34 2013 +0530
> @@ -2242,182 +2242,6 @@
> movd eax, m7
> RET
>
> -;-----------------------------------------------------------------------------
> -; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
> -; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> -;
> -; The maximum width this function can handle without risk of overflow is given
> -; in the following equation: (mmsize in bits)
> -;
> -; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
> -;
> -; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
> -; distortion levels it will take much more than that though.
> -;-----------------------------------------------------------------------------
> -%if HIGH_BIT_DEPTH
> -%macro SSD_NV12 0
> -cglobal pixel_ssd_nv12_core, 6,7,7
> - shl r4d, 2
> - FIX_STRIDES r1, r3
> - add r0, r4
> - add r2, r4
> - xor r6, r6
> - pxor m4, m4
> - pxor m5, m5
> - pxor m6, m6
> -.loopy:
> - mov r6, r4
> - neg r6
> - pxor m2, m2
> - pxor m3, m3
> -.loopx:
> - mova m0, [r0+r6]
> - mova m1, [r0+r6+mmsize]
> - psubw m0, [r2+r6]
> - psubw m1, [r2+r6+mmsize]
> - PSHUFLW m0, m0, q3120
> - PSHUFLW m1, m1, q3120
> -%if mmsize >= 16
> - pshufhw m0, m0, q3120
> - pshufhw m1, m1, q3120
> -%endif
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m2, m0
> - paddd m3, m1
> - add r6, 2*mmsize
> - jl .loopx
> -%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
> - jz .no_overread
> - psubd m3, m1
> -.no_overread:
> -%endif
> -%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
> - ; equation above, putting the width limit at 8208
> - punpckhdq m0, m2, m6
> - punpckhdq m1, m3, m6
> - punpckldq m2, m6
> - punpckldq m3, m6
> - paddq m3, m2
> - paddq m1, m0
> - paddq m4, m3
> - paddq m4, m1
> -%else ; unfortunately paddq is sse2
> - ; emulate 48 bit precision for mmx2 instead
> - mova m0, m2
> - mova m1, m3
> - punpcklwd m2, m6
> - punpcklwd m3, m6
> - punpckhwd m0, m6
> - punpckhwd m1, m6
> - paddd m3, m2
> - paddd m1, m0
> - paddd m4, m3
> - paddd m5, m1
> -%endif
> - add r0, r1
> - add r2, r3
> - dec r5d
> - jg .loopy
> - mov r3, r6m
> - mov r4, r7m
> -%if mmsize == 32
> - vextracti128 xm0, m4, 1
> - paddq xm4, xm0
> -%endif
> -%if mmsize >= 16
> - movq [r3], xm4
> - movhps [r4], xm4
> -%else ; fixup for mmx2
> - SBUTTERFLY dq, 4, 5, 0
> - mova m0, m4
> - psrld m4, 16
> - paddd m5, m4
> - pslld m0, 16
> - SBUTTERFLY dq, 0, 5, 4
> - psrlq m0, 16
> - psrlq m5, 16
> - movq [r3], m0
> - movq [r4], m5
> -%endif
> - RET
> -%endmacro ; SSD_NV12
> -%endif ; HIGH_BIT_DEPTH
> -
> -%if HIGH_BIT_DEPTH == 0
> -;-----------------------------------------------------------------------------
> -; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
> -; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> -;
> -; This implementation can potentially overflow on image widths >= 11008 (or
> -; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
> -; 20). At sane distortion levels it will take much more than that though.
> -;-----------------------------------------------------------------------------
> -%macro SSD_NV12 0
> -cglobal pixel_ssd_nv12_core, 6,7
> - add r4d, r4d
> - add r0, r4
> - add r2, r4
> - pxor m3, m3
> - pxor m4, m4
> - mova m5, [pw_00ff]
> -.loopy:
> - mov r6, r4
> - neg r6
> -.loopx:
> -%if mmsize == 32 ; only 16-byte alignment is guaranteed
> - movu m2, [r0+r6]
> - movu m1, [r2+r6]
> -%else
> - mova m2, [r0+r6]
> - mova m1, [r2+r6]
> -%endif
> - psubusb m0, m2, m1
> - psubusb m1, m2
> - por m0, m1
> - psrlw m2, m0, 8
> - pand m0, m5
> - pmaddwd m2, m2
> - pmaddwd m0, m0
> - paddd m3, m0
> - paddd m4, m2
> - add r6, mmsize
> - jl .loopx
> -%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
> - jz .no_overread
> - pcmpeqb xm1, xm1
> - pandn m0, m1, m0 ; zero the lower half
> - pandn m2, m1, m2
> - psubd m3, m0
> - psubd m4, m2
> -.no_overread:
> -%endif
> - add r0, r1
> - add r2, r3
> - dec r5d
> - jg .loopy
> - mov r3, r6m
> - mov r4, r7m
> - HADDD m3, m0
> - HADDD m4, m0
> - pxor xm0, xm0
> - punpckldq xm3, xm0
> - punpckldq xm4, xm0
> - movq [r3], xm3
> - movq [r4], xm4
> - RET
> -%endmacro ; SSD_NV12
> -%endif ; !HIGH_BIT_DEPTH
> -
> -INIT_MMX mmx2
> -SSD_NV12
> -INIT_XMM sse2
> -SSD_NV12
> -INIT_XMM avx
> -SSD_NV12
> -INIT_YMM avx2
> -SSD_NV12
> -
> ;=============================================================================
> ; variance
> ;=============================================================================
> @@ -2841,183 +2665,6 @@
> RET
> %endmacro
>
> -;-----------------------------------------------------------------------------
> -; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
> -;-----------------------------------------------------------------------------
> -%macro VAR2_8x8_MMX 2
> -cglobal pixel_var2_8x%1, 5,6
> - FIX_STRIDES r1, r3
> - VAR_START 0
> - mov r5d, %1
> -.loop:
> -%if HIGH_BIT_DEPTH
> - mova m0, [r0]
> - mova m1, [r0+mmsize]
> - psubw m0, [r2]
> - psubw m1, [r2+mmsize]
> -%else ; !HIGH_BIT_DEPTH
> - movq m0, [r0]
> - movq m1, m0
> - movq m2, [r2]
> - movq m3, m2
> - punpcklbw m0, m7
> - punpckhbw m1, m7
> - punpcklbw m2, m7
> - punpckhbw m3, m7
> - psubw m0, m2
> - psubw m1, m3
> -%endif ; HIGH_BIT_DEPTH
> - paddw m5, m0
> - paddw m5, m1
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m6, m0
> - paddd m6, m1
> - add r0, r1
> - add r2, r3
> - dec r5d
> - jg .loop
> - VAR2_END %2, m5, m6
> -%endmacro
> -
> -%if ARCH_X86_64 == 0
> -INIT_MMX mmx2
> -VAR2_8x8_MMX 8, 6
> -VAR2_8x8_MMX 16, 7
> -%endif
> -
> -%macro VAR2_8x8_SSE2 2
> -cglobal pixel_var2_8x%1, 5,6,8
> - VAR_START 1
> - mov r5d, %1/2
> -.loop:
> -%if HIGH_BIT_DEPTH
> - mova m0, [r0]
> - mova m1, [r0+r1*2]
> - mova m2, [r2]
> - mova m3, [r2+r3*2]
> -%else ; !HIGH_BIT_DEPTH
> - movq m1, [r0]
> - movhps m1, [r0+r1]
> - movq m3, [r2]
> - movhps m3, [r2+r3]
> - DEINTB 0, 1, 2, 3, 7
> -%endif ; HIGH_BIT_DEPTH
> - psubw m0, m2
> - psubw m1, m3
> - paddw m5, m0
> - paddw m5, m1
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m6, m0
> - paddd m6, m1
> - lea r0, [r0+r1*2*SIZEOF_PIXEL]
> - lea r2, [r2+r3*2*SIZEOF_PIXEL]
> - dec r5d
> - jg .loop
> - VAR2_END %2, m5, m6
> -%endmacro
> -
> -INIT_XMM sse2
> -VAR2_8x8_SSE2 8, 6
> -VAR2_8x8_SSE2 16, 7
> -
> -%if HIGH_BIT_DEPTH == 0
> -%macro VAR2_8x8_SSSE3 2
> -cglobal pixel_var2_8x%1, 5,6,8
> - pxor m5, m5 ; sum
> - pxor m6, m6 ; sum squared
> - mova m7, [hsub_mul]
> - mov r5d, %1/4
> -.loop:
> - movq m0, [r0]
> - movq m2, [r2]
> - movq m1, [r0+r1]
> - movq m3, [r2+r3]
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - punpcklbw m0, m2
> - punpcklbw m1, m3
> - movq m2, [r0]
> - movq m3, [r2]
> - punpcklbw m2, m3
> - movq m3, [r0+r1]
> - movq m4, [r2+r3]
> - punpcklbw m3, m4
> - pmaddubsw m0, m7
> - pmaddubsw m1, m7
> - pmaddubsw m2, m7
> - pmaddubsw m3, m7
> - paddw m5, m0
> - paddw m5, m1
> - paddw m5, m2
> - paddw m5, m3
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - pmaddwd m2, m2
> - pmaddwd m3, m3
> - paddd m6, m0
> - paddd m6, m1
> - paddd m6, m2
> - paddd m6, m3
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - dec r5d
> - jg .loop
> - VAR2_END %2, m5, m6
> -%endmacro
> -
> -INIT_XMM ssse3
> -VAR2_8x8_SSSE3 8, 6
> -VAR2_8x8_SSSE3 16, 7
> -INIT_XMM xop
> -VAR2_8x8_SSSE3 8, 6
> -VAR2_8x8_SSSE3 16, 7
> -
> -%macro VAR2_8x8_AVX2 2
> -cglobal pixel_var2_8x%1, 5,6,6
> - pxor m3, m3 ; sum
> - pxor m4, m4 ; sum squared
> - mova m5, [hsub_mul]
> - mov r5d, %1/4
> -.loop:
> - movq xm0, [r0]
> - movq xm1, [r2]
> - vinserti128 m0, m0, [r0+r1], 1
> - vinserti128 m1, m1, [r2+r3], 1
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - punpcklbw m0, m1
> - movq xm1, [r0]
> - movq xm2, [r2]
> - vinserti128 m1, m1, [r0+r1], 1
> - vinserti128 m2, m2, [r2+r3], 1
> - lea r0, [r0+r1*2]
> - lea r2, [r2+r3*2]
> - punpcklbw m1, m2
> - pmaddubsw m0, m5
> - pmaddubsw m1, m5
> - paddw m3, m0
> - paddw m3, m1
> - pmaddwd m0, m0
> - pmaddwd m1, m1
> - paddd m4, m0
> - paddd m4, m1
> - dec r5d
> - jg .loop
> - vextracti128 xm0, m3, 1
> - vextracti128 xm1, m4, 1
> - paddw xm3, xm0
> - paddd xm4, xm1
> - VAR2_END %2, xm3, xm4
> -%endmacro
> -
> -INIT_YMM avx2
> -VAR2_8x8_AVX2 8, 6
> -VAR2_8x8_AVX2 16, 7
> -
> -%endif ; !HIGH_BIT_DEPTH
> -
> ;=============================================================================
> ; SATD
> ;=============================================================================
> @@ -9009,446 +8656,6 @@
> INIT_XMM avx
> SSIM
>
> -;-----------------------------------------------------------------------------
> -; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
> -;-----------------------------------------------------------------------------
> -%macro ASD8 0
> -cglobal pixel_asd8, 5,5
> - pxor m0, m0
> - pxor m1, m1
> -.loop:
> -%if HIGH_BIT_DEPTH
> - paddw m0, [r0]
> - paddw m1, [r2]
> - paddw m0, [r0+2*r1]
> - paddw m1, [r2+2*r3]
> - lea r0, [r0+4*r1]
> - paddw m0, [r0]
> - paddw m1, [r2+4*r3]
> - lea r2, [r2+4*r3]
> - paddw m0, [r0+2*r1]
> - paddw m1, [r2+2*r3]
> - lea r0, [r0+4*r1]
> - lea r2, [r2+4*r3]
> -%else
> - movq m2, [r0]
> - movq m3, [r2]
> - movhps m2, [r0+r1]
> - movhps m3, [r2+r3]
> - lea r0, [r0+2*r1]
> - psadbw m2, m1
> - psadbw m3, m1
> - movq m4, [r0]
> - movq m5, [r2+2*r3]
> - lea r2, [r2+2*r3]
> - movhps m4, [r0+r1]
> - movhps m5, [r2+r3]
> - lea r0, [r0+2*r1]
> - paddw m0, m2
> - psubw m0, m3
> - psadbw m4, m1
> - psadbw m5, m1
> - lea r2, [r2+2*r3]
> - paddw m0, m4
> - psubw m0, m5
> -%endif
> - sub r4d, 4
> - jg .loop
> -%if HIGH_BIT_DEPTH
> - psubw m0, m1
> - HADDW m0, m1
> - ABSD m1, m0
> -%else
> - movhlps m1, m0
> - paddw m0, m1
> - ABSW m1, m0
> -%endif
> - movd eax, m1
> - RET
> -%endmacro
> -
> -INIT_XMM sse2
> -ASD8
> -INIT_XMM ssse3
> -ASD8
> -%if HIGH_BIT_DEPTH
> -INIT_XMM xop
> -ASD8
> -%endif
> -
> -;=============================================================================
> -; Successive Elimination ADS
> -;=============================================================================
> -
> -%macro ADS_START 0
> -%if UNIX64
> - movsxd r5, r5d
> -%else
> - mov r5d, r5m
> -%endif
> - mov r0d, r5d
> - lea r6, [r4+r5+(mmsize-1)]
> - and r6, ~(mmsize-1)
> - shl r2d, 1
> -%endmacro
> -
> -%macro ADS_END 1 ; unroll_size
> - add r1, 8*%1
> - add r3, 8*%1
> - add r6, 4*%1
> - sub r0d, 4*%1
> - jg .loop
> - WIN64_RESTORE_XMM rsp
> -%if mmsize==32
> - vzeroupper
> -%endif
> - lea r6, [r4+r5+(mmsize-1)]
> - and r6, ~(mmsize-1)
> -%if cpuflag(ssse3)
> - jmp ads_mvs_ssse3
> -%else
> - jmp ads_mvs_mmx
> -%endif
> -%endmacro
> -
> -;-----------------------------------------------------------------------------
> -; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
> -; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
> -;-----------------------------------------------------------------------------
> -INIT_MMX mmx2
> -cglobal pixel_ads4, 5,7
> - mova m6, [r0]
> - mova m4, [r0+8]
> - pshufw m7, m6, 0
> - pshufw m6, m6, q2222
> - pshufw m5, m4, 0
> - pshufw m4, m4, q2222
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+16]
> - psubw m0, m7
> - psubw m1, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - movu m2, [r1+r2]
> - movu m3, [r1+r2+16]
> - psubw m2, m5
> - psubw m3, m4
> - paddw m0, m1
> - ABSW m2, m2, m1
> - ABSW m3, m3, m1
> - paddw m0, m2
> - paddw m0, m3
> - pshufw m1, r6m, 0
> - paddusw m0, [r3]
> - psubusw m1, m0
> - packsswb m1, m1
> - movd [r6], m1
> - ADS_END 1
> -
> -cglobal pixel_ads2, 5,7
> - mova m6, [r0]
> - pshufw m5, r6m, 0
> - pshufw m7, m6, 0
> - pshufw m6, m6, q2222
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+r2]
> - psubw m0, m7
> - psubw m1, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - paddw m0, m1
> - paddusw m0, [r3]
> - mova m4, m5
> - psubusw m4, m0
> - packsswb m4, m4
> - movd [r6], m4
> - ADS_END 1
> -
> -cglobal pixel_ads1, 5,7
> - pshufw m7, [r0], 0
> - pshufw m6, r6m, 0
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+8]
> - psubw m0, m7
> - psubw m1, m7
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - paddusw m0, [r3]
> - paddusw m1, [r3+8]
> - mova m4, m6
> - mova m5, m6
> - psubusw m4, m0
> - psubusw m5, m1
> - packsswb m4, m5
> - mova [r6], m4
> - ADS_END 2
> -
> -%macro ADS_XMM 0
> -%if mmsize==32
> -cglobal pixel_ads4, 5,7,8
> - vpbroadcastw m7, [r0+ 0]
> - vpbroadcastw m6, [r0+ 4]
> - vpbroadcastw m5, [r0+ 8]
> - vpbroadcastw m4, [r0+12]
> -%else
> -cglobal pixel_ads4, 5,7,12
> - mova m4, [r0]
> - pshuflw m7, m4, q0000
> - pshuflw m6, m4, q2222
> - pshufhw m5, m4, q0000
> - pshufhw m4, m4, q2222
> - punpcklqdq m7, m7
> - punpcklqdq m6, m6
> - punpckhqdq m5, m5
> - punpckhqdq m4, m4
> -%endif
> -%if ARCH_X86_64 && mmsize == 16
> - movd m8, r6m
> - SPLATW m8, m8
> - ADS_START
> - movu m10, [r1]
> - movu m11, [r1+r2]
> -.loop:
> - psubw m0, m10, m7
> - movu m10, [r1+16]
> - psubw m1, m10, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - psubw m2, m11, m5
> - movu m11, [r1+r2+16]
> - paddw m0, m1
> - psubw m3, m11, m4
> - movu m9, [r3]
> - ABSW m2, m2, m1
> - ABSW m3, m3, m1
> - paddw m0, m2
> - paddw m0, m3
> - paddusw m0, m9
> - psubusw m1, m8, m0
> -%else
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+16]
> - psubw m0, m7
> - psubw m1, m6
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - movu m2, [r1+r2]
> - movu m3, [r1+r2+16]
> - psubw m2, m5
> - psubw m3, m4
> - paddw m0, m1
> - ABSW m2, m2, m1
> - ABSW m3, m3, m1
> - paddw m0, m2
> - paddw m0, m3
> - movu m2, [r3]
> -%if mmsize==32
> - vpbroadcastw m1, r6m
> -%else
> - movd m1, r6m
> - pshuflw m1, m1, 0
> - punpcklqdq m1, m1
> -%endif
> - paddusw m0, m2
> - psubusw m1, m0
> -%endif ; ARCH
> - packsswb m1, m1
> -%if mmsize==32
> - vpermq m1, m1, q3120
> - mova [r6], xm1
> -%else
> - movh [r6], m1
> -%endif
> - ADS_END mmsize/8
> -
> -cglobal pixel_ads2, 5,7,8
> -%if mmsize==32
> - vpbroadcastw m7, [r0+0]
> - vpbroadcastw m6, [r0+4]
> - vpbroadcastw m5, r6m
> -%else
> - movq m6, [r0]
> - movd m5, r6m
> - pshuflw m7, m6, 0
> - pshuflw m6, m6, q2222
> - pshuflw m5, m5, 0
> - punpcklqdq m7, m7
> - punpcklqdq m6, m6
> - punpcklqdq m5, m5
> -%endif
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+r2]
> - psubw m0, m7
> - psubw m1, m6
> - movu m4, [r3]
> - ABSW m0, m0, m2
> - ABSW m1, m1, m3
> - paddw m0, m1
> - paddusw m0, m4
> - psubusw m1, m5, m0
> - packsswb m1, m1
> -%if mmsize==32
> - vpermq m1, m1, q3120
> - mova [r6], xm1
> -%else
> - movh [r6], m1
> -%endif
> - ADS_END mmsize/8
> -
> -cglobal pixel_ads1, 5,7,8
> -%if mmsize==32
> - vpbroadcastw m7, [r0]
> - vpbroadcastw m6, r6m
> -%else
> - movd m7, [r0]
> - movd m6, r6m
> - pshuflw m7, m7, 0
> - pshuflw m6, m6, 0
> - punpcklqdq m7, m7
> - punpcklqdq m6, m6
> -%endif
> - ADS_START
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+mmsize]
> - psubw m0, m7
> - psubw m1, m7
> - movu m2, [r3]
> - movu m3, [r3+mmsize]
> - ABSW m0, m0, m4
> - ABSW m1, m1, m5
> - paddusw m0, m2
> - paddusw m1, m3
> - psubusw m4, m6, m0
> - psubusw m5, m6, m1
> - packsswb m4, m5
> -%if mmsize==32
> - vpermq m4, m4, q3120
> -%endif
> - mova [r6], m4
> - ADS_END mmsize/4
> -%endmacro
> -
> -INIT_XMM sse2
> -ADS_XMM
> -INIT_XMM ssse3
> -ADS_XMM
> -INIT_XMM avx
> -ADS_XMM
> -INIT_YMM avx2
> -ADS_XMM
> -
> -; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
> -; {
> -; int nmv=0, i, j;
> -; *(uint32_t*)(masks+width) = 0;
> -; for( i=0; i<width; i+=8 )
> -; {
> -; uint64_t mask = *(uint64_t*)(masks+i);
> -; if( !mask ) continue;
> -; for( j=0; j<8; j++ )
> -; if( mask & (255<<j*8) )
> -; mvs[nmv++] = i+j;
> -; }
> -; return nmv;
> -; }
> -
> -%macro TEST 1
> - mov [r4+r0*2], r1w
> - test r2d, 0xff<<(%1*8)
> - setne r3b
> - add r0d, r3d
> - inc r1d
> -%endmacro
> -
> -INIT_MMX mmx
> -cglobal pixel_ads_mvs, 0,7,0
> -ads_mvs_mmx:
> - ; mvs = r4
> - ; masks = r6
> - ; width = r5
> - ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
> - xor r0d, r0d
> - xor r1d, r1d
> - mov [r6+r5], r0d
> - jmp .loopi
> -ALIGN 16
> -.loopi0:
> - add r1d, 8
> - cmp r1d, r5d
> - jge .end
> -.loopi:
> - mov r2, [r6+r1]
> -%if ARCH_X86_64
> - test r2, r2
> -%else
> - mov r3, r2
> - add r3d, [r6+r1+4]
> -%endif
> - jz .loopi0
> - xor r3d, r3d
> - TEST 0
> - TEST 1
> - TEST 2
> - TEST 3
> -%if ARCH_X86_64
> - shr r2, 32
> -%else
> - mov r2d, [r6+r1]
> -%endif
> - TEST 0
> - TEST 1
> - TEST 2
> - TEST 3
> - cmp r1d, r5d
> - jl .loopi
> -.end:
> - movifnidn eax, r0d
> - RET
> -
> -INIT_XMM ssse3
> -cglobal pixel_ads_mvs, 0,7,0
> -ads_mvs_ssse3:
> - mova m3, [pw_8]
> - mova m4, [pw_76543210]
> - pxor m5, m5
> - add r5, r6
> - xor r0d, r0d ; nmv
> - mov [r5], r0d
> -%ifdef PIC
> - lea r1, [$$]
> - %define GLOBAL +r1-$$
> -%else
> - %define GLOBAL
> -%endif
> -.loop:
> - movh m0, [r6]
> - pcmpeqb m0, m5
> - pmovmskb r2d, m0
> - xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
> - movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
> - add r2d, r2d
> - ; shuffle counters based on mv mask
> - pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
> - movu [r4+r0*2], m2
> - add r0d, r3d
> - paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
> - add r6, 8
> - cmp r6, r5
> - jl .loop
> - movifnidn eax, r0d
> - RET
> -
> ;-----------------------------------------------------------------
> ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
-- 
Steve Borho