[x264-devel] Fix memory overwrite in x264_deblock_h_chroma_mbaff_sse2
Anton Mitrofanov
git at videolan.org
Thu Mar 13 21:23:50 CET 2014
x264 | branch: master | Anton Mitrofanov <BugMaster at narod.ru> | Mon Mar 10 03:22:57 2014 +0400| [9a7a0bf0e5fe4721fc9aa5fc095d2ce820d40513] | committer: Jason Garrett-Glaser
Fix memory overwrite in x264_deblock_h_chroma_mbaff_sse2
Fixes possible corruption with MBAFF+sliced threads.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9a7a0bf0e5fe4721fc9aa5fc095d2ce820d40513
---
common/x86/deblock-a.asm | 148 +++++++++++++++++++++++++++-------------------
1 file changed, 88 insertions(+), 60 deletions(-)
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 609f60c..ebb5aab 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -804,35 +804,6 @@ DEBLOCK_LUMA_INTRA
%define PASS8ROWS(base, base3, stride, stride3, offset) \
PASS8ROWS(base+offset, base3+offset, stride, stride3)
-; in: 8 rows of 4 bytes in %4..%11
-; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 11
- movh m0, %4
- movh m2, %5
- movh m1, %6
- movh m3, %7
- punpckl%1 m0, m2
- punpckl%1 m1, m3
- mova m2, m0
- punpckl%2 m0, m1
- punpckh%2 m2, m1
-
- movh m4, %8
- movh m6, %9
- movh m5, %10
- movh m7, %11
- punpckl%1 m4, m6
- punpckl%1 m5, m7
- mova m6, m4
- punpckl%2 m4, m5
- punpckh%2 m6, m5
-
- punpckh%3 m1, m0, m4
- punpckh%3 m3, m2, m6
- punpckl%3 m0, m4
- punpckl%3 m2, m6
-%endmacro
-
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
@@ -844,24 +815,24 @@ DEBLOCK_LUMA_INTRA
punpcklbw m2, m3
punpcklwd m1, m0, m2
punpckhwd m0, m2
- movh %1, m1
+ movd %1, m1
punpckhdq m1, m1
- movh %2, m1
- movh %3, m0
+ movd %2, m1
+ movd %3, m0
punpckhdq m0, m0
- movh %4, m0
+ movd %4, m0
punpckhdq m3, m3
punpcklbw m4, m5
punpcklbw m6, m3
punpcklwd m5, m4, m6
punpckhwd m4, m6
- movh %5, m5
+ movd %5, m5
punpckhdq m5, m5
- movh %6, m5
- movh %7, m4
+ movd %6, m5
+ movd %7, m4
punpckhdq m4, m4
- movh %8, m4
+ movd %8, m4
%endmacro
; in: 8 rows of 4 bytes in %9..%10
@@ -877,34 +848,94 @@ DEBLOCK_LUMA_INTRA
pextrd %8, %10, 3
%endmacro
-%macro TRANSPOSE4x8B_LOAD 8
- TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
-%endmacro
-
-%macro TRANSPOSE4x8W_LOAD 8
-%if mmsize==16
- TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
-%else
+; in: 4 rows of 4 words in %1..%4
+; out: 4 rows of 4 words in m0..m3
+; clobbers: m4
+%macro TRANSPOSE4x4W_LOAD 4-8
+%if mmsize==8
SWAP 1, 4, 2, 3
- mova m0, [t5]
- mova m1, [t5+r1]
- mova m2, [t5+r1*2]
- mova m3, [t5+t6]
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
TRANSPOSE4x4W 0, 1, 2, 3, 4
+%else
+ movq m0, %1
+ movq m2, %2
+ movq m1, %3
+ movq m3, %4
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpckldq m0, m1
+ punpckhdq m2, m1
+ movhlps m1, m0
+ movhlps m3, m2
%endif
%endmacro
-%macro TRANSPOSE8x2W_STORE 8
+; in: 2 rows of 4 words in m1..m2
+; out: 4 rows of 2 words in %1..%4
+; clobbers: m0, m1
+%macro TRANSPOSE4x2W_STORE 4-8
+%if mmsize==8
punpckhwd m0, m1, m2
punpcklwd m1, m2
-%if mmsize==8
+%else
+ punpcklwd m1, m2
+ movhlps m0, m1
+%endif
movd %3, m0
movd %1, m1
psrlq m1, 32
psrlq m0, 32
movd %2, m1
movd %4, m0
+%endmacro
+
+; in: 4/8 rows of 4 words in %1..%8
+; out: 4 rows of 4/8 words in m0..m3
+; clobbers: m4, m5, m6, m7
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==8 ; 8-byte registers: only rows %1..%4 are loaded, %5..%8 are ignored
+ TRANSPOSE4x4W_LOAD %1, %2, %3, %4
+%else ; wider registers: load all 8 rows and transpose them into m0..m3
+ movq m0, %1 ; m0 = row 0 (8 bytes = 4 words)
+ movq m2, %2 ; m2 = row 1
+ movq m1, %3 ; m1 = row 2
+ movq m3, %4 ; m3 = row 3
+ punpcklwd m0, m2 ; interleave words of rows 0 and 1
+ punpcklwd m1, m3 ; interleave words of rows 2 and 3
+ mova m2, m0 ; keep a copy, m0 is overwritten next
+ punpckldq m0, m1 ; m0 = words 0 and 1 of rows 0..3
+ punpckhdq m2, m1 ; m2 = words 2 and 3 of rows 0..3
+
+ movq m4, %5 ; m4 = row 4
+ movq m6, %6 ; m6 = row 5
+ movq m5, %7 ; m5 = row 6
+ movq m7, %8 ; m7 = row 7
+ punpcklwd m4, m6 ; interleave words of rows 4 and 5
+ punpcklwd m5, m7 ; interleave words of rows 6 and 7
+ mova m6, m4 ; keep a copy, m4 is overwritten next
+ punpckldq m4, m5 ; m4 = words 0 and 1 of rows 4..7
+ punpckhdq m6, m5 ; m6 = words 2 and 3 of rows 4..7
+
+ punpckhqdq m1, m0, m4 ; m1 = word 1 of rows 0..7 (high halves of m0 and m4)
+ punpckhqdq m3, m2, m6 ; m3 = word 3 of rows 0..7 (high halves of m2 and m6)
+ punpcklqdq m0, m4 ; m0 = word 0 of rows 0..7
+ punpcklqdq m2, m6 ; m2 = word 2 of rows 0..7
+%endif
+%endmacro
+
+; in: 2 rows of 4/8 words in m1..m2
+; out: 4/8 rows of 2 words in %1..%8
+; clobbers: m0, m1
+%macro TRANSPOSE8x2W_STORE 8
+%if mmsize==8
+ TRANSPOSE4x2W_STORE %1, %2, %3, %4
%else
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
movd %5, m0
movd %1, m1
psrldq m1, 4
@@ -1118,7 +1149,7 @@ DEBLOCK_LUMA_INTRA
%endif
mova m6, [pb_1]
psubusb m4, m6 ; alpha - 1
- psubusb m5, m6 ; alpha - 2
+ psubusb m5, m6 ; beta - 1
%if %0>2
mova %3, m4
%endif
@@ -2098,17 +2129,14 @@ DEBLOCK_CHROMA
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
cglobal deblock_h_chroma_mbaff, 5,7,8
- sub r0, 4
- lea t6, [r1*3]
- mov t5, r0
- add r0, t6
- TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ CHROMA_H_START
+ TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
- TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
RET
%endmacro
@@ -2249,9 +2277,9 @@ DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
CHROMA_H_START
- TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body
- TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
RET
%endif ; !HIGH_BIT_DEPTH
More information about the x264-devel
mailing list