[x264-devel] Fix illegal instruction in high bit depth ssd_nv12_mmxext
Holger Lubitz
git at videolan.org
Wed Jan 26 02:56:54 CET 2011
x264 | branch: master | Holger Lubitz <holger at lubitz.org> | Fri Jan 21 17:17:29 2011 +0100| [b4865a6fe6795d3e882214d4c1ff5f3a5dafbab2] | committer: Jason Garrett-Glaser
Fix illegal instruction in high bit depth ssd_nv12_mmxext
Unfortunately paddq isn't available in mmxext, only in sse2 and up.
Also fix the accumulation so that widths up to 16416 (mmxext) / 32832 (sse2) are actually handled without overflow.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b4865a6fe6795d3e882214d4c1ff5f3a5dafbab2
---
common/x86/pixel-a.asm | 55 ++++++++++++++++++++++++++++++++++-------------
1 files changed, 40 insertions(+), 15 deletions(-)
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 1e20c1c..19d297d 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -387,7 +387,7 @@ SSD 4, 8, ssse3
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
-; in the following equation:
+; in the following equation: (mmsize in bits)
;
; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
@@ -404,7 +404,7 @@ cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
xor r6, r6
pxor m4, m4
pxor m5, m5
- mova m6, [sq_0f]
+ pxor m6, m6
.loopy:
mov r6, r4
neg r6
@@ -415,7 +415,7 @@ cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
mova m1, [r0+r6+mmsize]
psubw m0, [r2+r6]
psubw m1, [r2+r6+mmsize]
-%if mmsize == 8
+%if mmsize==8
pshufw m0, m0, 11011000b
pshufw m1, m1, 11011000b
%else
@@ -430,27 +430,52 @@ cglobal pixel_ssd_nv12_core_%1, 6,7,7*(mmsize/16)
paddd m3, m1
add r6, 2*mmsize
jl .loopx
-%if mmsize == 8
- SBUTTERFLY dq, 2, 3, 1
-%else
- mova m1, m2
- shufps m2, m3, 10001000b
- shufps m3, m1, 11011101b
-%endif
- HADDD m2, m1
- HADDD m3, m1
- pand m2, m6
- pand m3, m6
- paddq m4, m2
- paddq m5, m3
+%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
+ ; equation above, putting the width limit at 8208
+ mova m0, m2
+ mova m1, m3
+ punpckldq m2, m6
+ punpckldq m3, m6
+ punpckhdq m0, m6
+ punpckhdq m1, m6
+ paddq m3, m2
+ paddq m1, m0
+ paddq m4, m3
+ paddq m4, m1
+%else ; unfortunately paddq is sse2
+ ; emulate 48 bit precision for mmxext instead
+ mova m0, m2
+ mova m1, m3
+ punpcklwd m2, m6
+ punpcklwd m3, m6
+ punpckhwd m0, m6
+ punpckhwd m1, m6
+ paddd m3, m2
+ paddd m1, m0
+ paddd m4, m3
+ paddd m5, m1
+%endif
add r0, r1
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
+%if mmsize==16
movq [r3], m4
+ movhps [r4], m4
+%else ; fixup for mmxext
+ SBUTTERFLY dq, 4, 5, 0
+ mova m0, m4
+ psrld m4, 16
+ paddd m5, m4
+ pslld m0, 16
+ SBUTTERFLY dq, 0, 5, 4
+ psrlq m0, 16
+ psrlq m5, 16
+ movq [r3], m0
movq [r4], m5
+%endif
RET
%endmacro ; SSD_NV12
%endif ; HIGH_BIT_DEPTH
More information about the x264-devel mailing list.