[x265] [PATCH 1 of 2] fix missing emms in mmx functions
Min Chen
chenm003 at 163.com
Wed Jul 30 03:56:58 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1406685333 25200
# Node ID 9fbda07104f6e7c03c608320a6b88ed877074e25
# Parent a9678988dda2aea1f8d8babf05de7717896946f3
fix missing emms in mmx functions
diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Jul 29 23:14:42 2014 +0530
+++ b/source/common/x86/mc-a.asm Tue Jul 29 18:55:33 2014 -0700
@@ -1983,7 +1983,7 @@
lea t0, [t0+t1*2*SIZEOF_PIXEL]
sub eax, 2
jg .height_loop
- %ifidn movu,movq ; detect MMX
+ %if (mmsize == 8)
EMMS
%endif
RET
@@ -2422,6 +2422,9 @@
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
.fast:
@@ -2432,6 +2435,9 @@
lea r2, [r2+r3*2]
sub r5d, 2
jg .fastloop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endif
%endmacro
@@ -2517,6 +2523,9 @@
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -2851,6 +2860,9 @@
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -2886,6 +2898,9 @@
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -2926,6 +2941,9 @@
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
cglobal pixel_avg2_w16_mmx2, 6,7
@@ -2960,6 +2978,9 @@
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
cglobal pixel_avg2_w18_mmx2, 6,7
@@ -2984,6 +3005,9 @@
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%macro PIXEL_AVG_W18 0
@@ -3012,6 +3036,9 @@
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -3043,6 +3070,9 @@
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -3071,6 +3101,9 @@
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -3103,6 +3136,7 @@
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
+ emms
RET
INIT_XMM
@@ -3146,6 +3180,7 @@
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
+ emms
RET
INIT_YMM avx2
@@ -3222,6 +3257,7 @@
add r0, r1
dec r5d
jg .height_loop
+ emms
RET
%endmacro
@@ -3411,6 +3447,7 @@
lea r0, [r0+r1*4]
.end:
COPY1 r4, r5
+ emms
RET
%macro MC_COPY 1
@@ -3426,6 +3463,9 @@
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endif
%endmacro
diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Tue Jul 29 23:14:42 2014 +0530
+++ b/source/common/x86/mc-a2.asm Tue Jul 29 18:55:33 2014 -0700
@@ -338,6 +338,9 @@
add r4, r5
dec dword r7m
jg .loopy
+%if (mmsize == 8)
+ emms
+%endif
RET
;-----------------------------------------------------------------------------
@@ -353,6 +356,9 @@
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
+%if (mmsize == 8)
+ emms
+%endif
RET
;-----------------------------------------------------------------------------
@@ -368,6 +374,9 @@
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro ; PLANE_DEINTERLEAVE
@@ -433,6 +442,9 @@
sub r2d, 4*mmsize
jg .copy4
.ret:
+%if (mmsize == 8)
+ emms
+%endif
REP_RET
%endmacro
diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Jul 29 23:14:42 2014 +0530
+++ b/source/common/x86/pixel-a.asm Tue Jul 29 18:55:33 2014 -0700
@@ -320,6 +320,9 @@
movd eax, m0
and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -373,6 +376,9 @@
movd r4, m2
%endrep
movifnidn eax, r6d
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -393,6 +399,9 @@
call pixel_satd_16x4_internal_mmx2
HADDUW m0, m1
movd eax, m0
+%if (mmsize == 8)
+ emms
+%endif
RET
cglobal pixel_satd_16x8, 4,6
diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jul 29 23:14:42 2014 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Jul 29 18:55:33 2014 -0700
@@ -4177,6 +4177,9 @@
movd eax, m5
movd edx, m6
%endif
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
diff -r a9678988dda2 -r 9fbda07104f6 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Tue Jul 29 23:14:42 2014 +0530
+++ b/source/common/x86/sad-a.asm Tue Jul 29 18:55:33 2014 -0700
@@ -89,15 +89,19 @@
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
-cglobal pixel_sad_%1x%2_mmx2, 4,4
+cglobal pixel_sad_%1x%2, 4,4
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
%endrep
movd eax, mm0
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
+INIT_MMX mmx2
SAD 16, 16
SAD 16, 8
SAD 8, 16
@@ -106,7 +110,7 @@
SAD 4, 16
SAD 4, 8
SAD 4, 4
-
+%undef SAD
;=============================================================================
@@ -117,6 +121,9 @@
movhlps m1, m0
paddw m0, m1
movd eax, m0
+%if (cpuflags <= cpuflags_mmx2)
+ %error SSE2 macro use by MMX function!
+%endif
RET
%endmacro
@@ -833,9 +840,9 @@
paddw m0, m2
%endmacro
-INIT_XMM
+INIT_XMM sse2
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
-cglobal pixel_sad_8x16_sse2, 4,4
+cglobal pixel_sad_8x16, 4,4
SAD_INC_4x8P_SSE 0
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
@@ -1046,6 +1053,9 @@
movd [r0+4], mm1
movd [r0+8], mm2
%endif
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -1055,6 +1065,9 @@
movd [r0+4], mm1
movd [r0+8], mm2
movd [r0+12], mm3
+%if (mmsize == 8)
+ emms
+%endif
RET
%endmacro
@@ -2384,7 +2397,7 @@
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
-cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
+cglobal pixel_sad_x%1_%2x%3, %1+2, %1+2
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
@@ -2392,7 +2405,7 @@
SAD_X%1_END
%endmacro
-INIT_MMX
+INIT_MMX mmx2
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
@@ -3472,6 +3485,7 @@
dec r4
jg .loop
movd eax, mm0
+ emms
RET
%endmacro
@@ -3498,6 +3512,7 @@
dec r4
jg .loop
movd eax, mm0
+ emms
RET
%endmacro
More information about the x265-devel
mailing list