[x264-devel] x86: Faster plane_copy_deinterleave_rgb_sse2

Henrik Gramner git at videolan.org
Tue Jan 24 21:14:12 CET 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Jan 12 21:36:28 2017 +0100| [da71b556730c8eb6c12a0d6950a221a4e4a99ca6] | committer: Henrik Gramner

x86: Faster plane_copy_deinterleave_rgb_sse2

50% faster than the previous SSE2 function.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=da71b556730c8eb6c12a0d6950a221a4e4a99ca6
---

 common/x86/mc-a2.asm | 58 ++++++++++++++++++++--------------------------------
 1 file changed, 22 insertions(+), 36 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index cb287c2..cacef9f 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1257,36 +1257,27 @@ cglobal load_deinterleave_chroma_fdec, 4,4
     movu        m0, [%8]
     movu        m1, [%8+%1*mmsize/4]
 %if cpuflag(ssse3)
-    pshufb      m0, m3        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
-    pshufb      m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
+    pshufb      m0, m3        ; a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 __ __ __ __
+    pshufb      m1, m3        ; a4 a5 a6 a7 b4 b5 b6 b7 c4 c5 c6 c7 __ __ __ __
+    SBUTTERFLY  dq, 0, 1, 2
 %elif %1 == 3
-    psrldq      m2, m0, 6
-    punpcklqdq  m0, m1        ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
-    psrldq      m1, 6
-    punpcklqdq  m2, m1        ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
-    psrlq       m3, m0, 24
-    psrlq       m4, m2, 24
-    punpckhbw   m1, m0, m3    ; b4 b5 g4 g5 r4 r5
-    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
-    punpckhbw   m3, m2, m4    ; b6 b7 g6 g7 r6 r7
-    punpcklbw   m2, m4        ; b2 b3 g2 g3 r2 r3
-    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
-    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
-%else
-    pshufd      m3, m0, q2301
-    pshufd      m4, m1, q2301
-    punpckhbw   m2, m0, m3    ; b2 b3 g2 g3 r2 r3
-    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
-    punpckhbw   m3, m1, m4    ; b6 b7 g6 g7 r6 r7
-    punpcklbw   m1, m4        ; b4 b5 g4 g5 r4 r5
-    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
-    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
-%endif
-    punpckldq   m2, m0, m1    ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
-    punpckhdq   m0, m1        ; r0 r1 r2 r3 r4 r5 r6 r7
-    movh   [r0+%9], m2
-    movhps [r2+%9], m2
-    movh   [r4+%9], m0
+    SBUTTERFLY  bw, 0, 1, 2
+    pshufd      m2, m0, q0321 ; c0 c4 a1 a5 b1 b5 c1 c5 __ __ __ __ a0 a4 b0 b4
+    punpcklbw   m3, m2, m1    ; c0 c2 c4 c6 a1 a3 a5 a7 b1 b3 b5 b7 c1 c3 c5 c7
+    punpckhbw   m2, m0        ; __ __ __ __ __ __ __ __ a0 a2 a4 a6 b0 b2 b4 b6
+    pshufd      m0, m3, q2103 ; c1 c3 c5 c7 __ __ __ __ a1 a3 a5 a7 b1 b3 b5 b7
+    punpckhbw   m2, m0        ; a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
+    punpcklbw   m3, m0        ; c0 c1 c2 c3 c4 c5 c6 c7
+    SWAP         0, 2
+    SWAP         1, 3
+%else ; %1 == 4
+    SBUTTERFLY  bw, 0, 1, 2
+    SBUTTERFLY  bw, 0, 1, 2
+    SBUTTERFLY  bw, 0, 1, 2
+%endif
+    movq   [r0+%9], m0
+    movhps [r2+%9], m0
+    movq   [r4+%9], m1
     add         %8, %1*mmsize/2
     add         %9, mmsize/2
     jl %%loopx
@@ -1338,13 +1329,6 @@ cglobal plane_copy_deinterleave_rgb, 1,7
     REP_RET
 %endmacro
 
-%if HIGH_BIT_DEPTH == 0
-INIT_XMM sse2
-PLANE_DEINTERLEAVE_RGB
-INIT_XMM ssse3
-PLANE_DEINTERLEAVE_RGB
-%endif ; !HIGH_BIT_DEPTH
-
 %macro PLANE_DEINTERLEAVE_V210 0
 ;-----------------------------------------------------------------------------
 ; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
@@ -1427,8 +1411,10 @@ PLANE_DEINTERLEAVE
 INIT_XMM sse2
 PLANE_INTERLEAVE
 PLANE_DEINTERLEAVE
+PLANE_DEINTERLEAVE_RGB
 INIT_XMM ssse3
 PLANE_DEINTERLEAVE
+PLANE_DEINTERLEAVE_RGB
 %endif
 
 ; These functions are not general-use; not only do the SSE ones require aligned input,



More information about the x264-devel mailing list