[x265] [PATCH 1 of 2] fix missing emms in mmx functions
Steve Borho
steve at borho.org
Wed Jul 30 03:30:43 CEST 2014
On 07/29, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1406685333 25200
> # Node ID 9fbda07104f6e7c03c608320a6b88ed877074e25
> # Parent a9678988dda2aea1f8d8babf05de7717896946f3
> fix missing emms in mmx functions
As you know, I'm of a mixed mind about this; I rather prefer x264's
method of only issuing EMMS prior to performing any floating-point math
or just before returning from API calls.
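For illustration, that style looks roughly like the sketch below. The
function names are hypothetical, not actual x265 code; _mm_empty() is
the standard C intrinsic that emits EMMS (note that newer x86-64
compilers may emulate MMX intrinsics with SSE, which doesn't change the
structural point):

    #include <stdio.h>
    #include <xmmintrin.h>  /* MMX intrinsics plus the MMX psadbw extension */

    /* Toy MMX kernel: like the asm kernels in this patch, it deliberately
     * does NOT execute EMMS itself. */
    static int sad8_mmx(const unsigned char *a, const unsigned char *b)
    {
        __m64 s = _m_psadbw(*(const __m64 *)a, *(const __m64 *)b);
        return _m_to_int(s);          /* FPU is left in MMX state */
    }

    double cost_with_one_emms(const unsigned char *a,
                              const unsigned char *b, double lambda)
    {
        int sad = sad8_mmx(a, b);     /* MMX state still live here */
        _mm_empty();                  /* the single EMMS, issued by the
                                         caller just before float math */
        return sad * lambda;          /* float math is now safe */
    }

    int main(void)
    {
        unsigned char a[8] = {0,1,2,3,4,5,6,7}, b[8] = {7,6,5,4,3,2,1,0};
        printf("%f\n", cost_with_one_emms(a, b, 0.85));
        return 0;
    }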
But sadly the fact is that today we still have too many floating-point
operations scattered about to make that approach robust, particularly in
RDO and rate control.
If only there were a way to make the CPU trigger an exception when we
miss an EMMS and subsequently break a floating-point operation; then we
could debug this sanely.
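As it happens, something close to that is possible, at least with glibc:
a missed EMMS leaves all eight x87 registers tagged as in-use, so the
next x87 load overflows the register stack and signals the x87
invalid-operation exception; unmasking FE_INVALID turns that into a
SIGFPE a debugger will stop on. A minimal sketch (glibc-specific, with
the big caveat that only x87 instructions trap: on x86-64, where plain
float/double math is SSE, this mainly catches long-double and 32-bit
x87 code paths):

    #define _GNU_SOURCE
    #include <fenv.h>        /* feenableexcept(); link with -lm */
    #include <mmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        feenableexcept(FE_INVALID);   /* unmask invalid-operation traps */

        __m64 v = _mm_set1_pi16(1);   /* MMX op: x87 stack now "full" */
        (void)v;                      /* deliberately no _mm_empty() */

        /* Long-double math uses x87; the fld overflows the full register
         * stack and the sequence raises SIGFPE.  If GCC emulates MMX with
         * SSE on your target, build 32-bit (-m32) to see the trap. */
        volatile long double x = 1.0L;
        printf("%Lf\n", x * 2.0L);
        return 0;
    }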
What are the performance implications of this patch?
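For a first-order answer, a crude microbenchmark like the one below
times a tight loop of EMMS via _mm_empty(); it measures only the
instruction's standalone throughput, not its interaction with the
surrounding kernels, so treat any result as a ballpark:

    #include <mmintrin.h>
    #include <stdio.h>
    #include <time.h>

    static double seconds(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec + ts.tv_nsec * 1e-9;
    }

    int main(void)
    {
        enum { N = 100000000 };
        double t0 = seconds();
        for (int i = 0; i < N; i++)
            _mm_empty();              /* one EMMS per iteration; if the
                                         compiler emulates MMX with SSE
                                         this is a no-op, so build -m32
                                         for a real EMMS */
        double t1 = seconds();
        printf("~%.2f ns per emms\n", (t1 - t0) / N * 1e9);
        return 0;
    }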
> diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/mc-a.asm
> --- a/source/common/x86/mc-a.asm Tue Jul 29 23:14:42 2014 +0530
> +++ b/source/common/x86/mc-a.asm Tue Jul 29 18:55:33 2014 -0700
> @@ -1983,7 +1983,7 @@
> lea t0, [t0+t1*2*SIZEOF_PIXEL]
> sub eax, 2
> jg .height_loop
> - %ifidn movu,movq ; detect MMX
> + %if (mmsize == 8)
> EMMS
> %endif
> RET
> @@ -2422,6 +2422,9 @@
> lea r2, [r2+r3*2]
> sub r5d, 2
> jg .loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
> .fast:
> @@ -2432,6 +2435,9 @@
> lea r2, [r2+r3*2]
> sub r5d, 2
> jg .fastloop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endif
> %endmacro
> @@ -2517,6 +2523,9 @@
> lea r2, [r2+r3*2]
> sub r5d, 2
> jg .loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -2851,6 +2860,9 @@
> lea r0, [r0+r1*4]
> sub r5d, 2
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -2886,6 +2898,9 @@
> lea r0, [r0+r1*4]
> sub r5d, 2
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -2926,6 +2941,9 @@
> lea r0, [r0+r1*2*2]
> sub r5d, 2
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
>
> cglobal pixel_avg2_w16_mmx2, 6,7
> @@ -2960,6 +2978,9 @@
> lea r0, [r0+r1*2*2]
> sub r5d, 2
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
>
> cglobal pixel_avg2_w18_mmx2, 6,7
> @@ -2984,6 +3005,9 @@
> lea r0, [r0+r1*2]
> dec r5d
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
>
> %macro PIXEL_AVG_W18 0
> @@ -3012,6 +3036,9 @@
> lea r0, [r0+r1*2]
> dec r5d
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -3043,6 +3070,9 @@
> lea r0, [r0+r1*2]
> sub r5d, 2
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -3071,6 +3101,9 @@
> lea r0, [r0+r1*2]
> sub r5d, 2
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -3103,6 +3136,7 @@
> lea r0, [r0+r1*2]
> sub r5d, 2
> jg .height_loop
> + emms
> RET
>
> INIT_XMM
> @@ -3146,6 +3180,7 @@
> lea r0, [r0+r1*2]
> sub r5d, 2
> jg .height_loop
> + emms
> RET
>
> INIT_YMM avx2
> @@ -3222,6 +3257,7 @@
> add r0, r1
> dec r5d
> jg .height_loop
> + emms
> RET
> %endmacro
>
> @@ -3411,6 +3447,7 @@
> lea r0, [r0+r1*4]
> .end:
> COPY1 r4, r5
> + emms
> RET
>
> %macro MC_COPY 1
> @@ -3426,6 +3463,9 @@
> lea r0, [r0+r1*4]
> sub r4d, 4
> jg .height_loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endif
> %endmacro
> diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/mc-a2.asm
> --- a/source/common/x86/mc-a2.asm Tue Jul 29 23:14:42 2014 +0530
> +++ b/source/common/x86/mc-a2.asm Tue Jul 29 18:55:33 2014 -0700
> @@ -338,6 +338,9 @@
> add r4, r5
> dec dword r7m
> jg .loopy
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
>
> ;-----------------------------------------------------------------------------
> @@ -353,6 +356,9 @@
> lea r1, [r1+r2*2]
> sub r3d, 2
> jg .loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
>
> ;-----------------------------------------------------------------------------
> @@ -368,6 +374,9 @@
> lea r1, [r1+r2*2]
> sub r3d, 2
> jg .loop
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro ; PLANE_DEINTERLEAVE
>
> @@ -433,6 +442,9 @@
> sub r2d, 4*mmsize
> jg .copy4
> .ret:
> +%if (mmsize == 8)
> + emms
> +%endif
> REP_RET
> %endmacro
>
> diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Tue Jul 29 23:14:42 2014 +0530
> +++ b/source/common/x86/pixel-a.asm Tue Jul 29 18:55:33 2014 -0700
> @@ -320,6 +320,9 @@
> movd eax, m0
> and eax, 0xffff
> %endif ; HIGH_BIT_DEPTH
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -373,6 +376,9 @@
> movd r4, m2
> %endrep
> movifnidn eax, r6d
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -393,6 +399,9 @@
> call pixel_satd_16x4_internal_mmx2
> HADDUW m0, m1
> movd eax, m0
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
>
> cglobal pixel_satd_16x8, 4,6
> diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Jul 29 23:14:42 2014 +0530
> +++ b/source/common/x86/pixel-util8.asm Tue Jul 29 18:55:33 2014 -0700
> @@ -4177,6 +4177,9 @@
> movd eax, m5
> movd edx, m6
> %endif
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm Tue Jul 29 23:14:42 2014 +0530
> +++ b/source/common/x86/sad-a.asm Tue Jul 29 18:55:33 2014 -0700
> @@ -89,15 +89,19 @@
> ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
> ;-----------------------------------------------------------------------------
> %macro SAD 2
> -cglobal pixel_sad_%1x%2_mmx2, 4,4
> +cglobal pixel_sad_%1x%2, 4,4
> pxor mm0, mm0
> %rep %2/2
> SAD_INC_2x%1P
> %endrep
> movd eax, mm0
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> +INIT_MMX mmx2
> SAD 16, 16
> SAD 16, 8
> SAD 8, 16
> @@ -106,7 +110,7 @@
> SAD 4, 16
> SAD 4, 8
> SAD 4, 4
> -
> +%undef SAD
>
>
> ;=============================================================================
> @@ -117,6 +121,9 @@
> movhlps m1, m0
> paddw m0, m1
> movd eax, m0
> +%if (cpuflags <= cpuflags_mmx2)
> + %error SSE2 macro used by MMX function!
> +%endif
> RET
> %endmacro
>
> @@ -833,9 +840,9 @@
> paddw m0, m2
> %endmacro
>
> -INIT_XMM
> +INIT_XMM sse2
> ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
> -cglobal pixel_sad_8x16_sse2, 4,4
> +cglobal pixel_sad_8x16, 4,4
> SAD_INC_4x8P_SSE 0
> SAD_INC_4x8P_SSE 1
> SAD_INC_4x8P_SSE 1
> @@ -1046,6 +1053,9 @@
> movd [r0+4], mm1
> movd [r0+8], mm2
> %endif
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -1055,6 +1065,9 @@
> movd [r0+4], mm1
> movd [r0+8], mm2
> movd [r0+12], mm3
> +%if (mmsize == 8)
> + emms
> +%endif
> RET
> %endmacro
>
> @@ -2384,7 +2397,7 @@
> ; uint8_t *pix2, intptr_t i_stride, int scores[3] )
> ;-----------------------------------------------------------------------------
> %macro SAD_X 3
> -cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
> +cglobal pixel_sad_x%1_%2x%3, %1+2, %1+2
> SAD_X%1_2x%2P 1
> %rep %3/2-1
> SAD_X%1_2x%2P 0
> @@ -2392,7 +2405,7 @@
> SAD_X%1_END
> %endmacro
>
> -INIT_MMX
> +INIT_MMX mmx2
> SAD_X 3, 16, 16
> SAD_X 3, 16, 8
> SAD_X 3, 8, 16
> @@ -3472,6 +3485,7 @@
> dec r4
> jg .loop
> movd eax, mm0
> + emms
> RET
> %endmacro
>
> @@ -3498,6 +3512,7 @@
> dec r4
> jg .loop
> movd eax, mm0
> + emms
> RET
> %endmacro
>
>
--
Steve Borho