[x264-devel] x86: AVX2 plane_copy_deinterleave_rgb
Henrik Gramner
git at videolan.org
Tue Jan 24 21:14:12 CET 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Jan 12 22:16:53 2017 +0100| [f4890275ca6523dfe5b4ae60279ae8597d9dbd4b] | committer: Henrik Gramner
x86: AVX2 plane_copy_deinterleave_rgb
Around 15% faster than SSSE3.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f4890275ca6523dfe5b4ae60279ae8597d9dbd4b
---
common/x86/mc-a2.asm | 45 +++++++++++++++++++++++++++++++++++++--------
common/x86/mc-c.c | 5 +++++
2 files changed, 42 insertions(+), 8 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index cacef9f..07ecc74 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -52,8 +52,8 @@ deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
-deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
- db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1
+deinterleave_rgb_shuf: db 0, 3, 6, 9, 0, 3, 6, 9, 1, 4, 7,10, 2, 5, 8,11
+ db 0, 4, 8,12, 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
@@ -96,6 +96,7 @@ cextern pw_0to15
cextern pd_8
cextern pd_0123
cextern pd_ffff
+cextern deinterleave_shufd
%macro LOAD_ADD 4
movh %4, %3
@@ -1247,19 +1248,41 @@ cglobal load_deinterleave_chroma_fdec, 4,4
%endmacro ; PLANE_DEINTERLEAVE
%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
-%if cpuflag(ssse3)
+%if mmsize == 32
+ vbroadcasti128 m3, [deinterleave_rgb_shuf+(%1-3)*16]
+%elif cpuflag(ssse3)
mova m3, [deinterleave_rgb_shuf+(%1-3)*16]
%endif
%%loopy:
mov %8, r6
mov %9, %6
%%loopx:
+%if mmsize == 32 && %1 == 3
+ movu xm0, [%8+0*12]
+ vinserti128 m0, m0, [%8+1*12], 1
+ movu xm1, [%8+2*12]
+ vinserti128 m1, m1, [%8+3*12], 1
+%else
movu m0, [%8]
movu m1, [%8+%1*mmsize/4]
+%endif
%if cpuflag(ssse3)
- pshufb m0, m3 ; a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 __ __ __ __
- pshufb m1, m3 ; a4 a5 a6 a7 b4 b5 b6 b7 c4 c5 c6 c7 __ __ __ __
+ pshufb m0, m3 ; a0 a1 a2 a3 a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3
+ pshufb m1, m3 ; a4 a5 a6 a7 a4 a5 a6 a7 b4 b5 b6 b7 c4 c5 c6 c7
+%if mmsize == 32
+ vpblendd m2, m0, m1, 0x22
+ punpckhdq m0, m1
+ vpermd m2, m4, m2
+ vpermd m0, m4, m0
+ mova [r0+%9], xm2
+ mova [r2+%9], xm0
+ vextracti128 [r4+%9], m0, 1
+%else
SBUTTERFLY dq, 0, 1, 2
+ movq [r0+%9], m0
+ movq [r2+%9], m1
+ movhps [r4+%9], m1
+%endif
%elif %1 == 3
SBUTTERFLY bw, 0, 1, 2
pshufd m2, m0, q0321 ; c0 c4 a1 a5 b1 b5 c1 c5 __ __ __ __ a0 a4 b0 b4
@@ -1268,16 +1291,17 @@ cglobal load_deinterleave_chroma_fdec, 4,4
pshufd m0, m3, q2103 ; c1 c3 c5 c7 __ __ __ __ a1 a3 a5 a7 b1 b3 b5 b7
punpckhbw m2, m0 ; a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
punpcklbw m3, m0 ; c0 c1 c2 c3 c4 c5 c6 c7
- SWAP 0, 2
- SWAP 1, 3
+ movq [r0+%9], m2
+ movhps [r2+%9], m2
+ movq [r4+%9], m3
%else ; %1 == 4
SBUTTERFLY bw, 0, 1, 2
SBUTTERFLY bw, 0, 1, 2
SBUTTERFLY bw, 0, 1, 2
-%endif
movq [r0+%9], m0
movhps [r2+%9], m0
movq [r4+%9], m1
+%endif
add %8, %1*mmsize/2
add %9, mmsize/2
jl %%loopx
@@ -1319,6 +1343,9 @@ cglobal plane_copy_deinterleave_rgb, 1,7
mov r9m, r1
mov r1, r10m
%endif
+%if mmsize == 32
+ mova m4, [deinterleave_shufd]
+%endif
cmp dword r8m, 4
je .pw4
PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
@@ -1415,6 +1442,8 @@ PLANE_DEINTERLEAVE_RGB
INIT_XMM ssse3
PLANE_DEINTERLEAVE
PLANE_DEINTERLEAVE_RGB
+INIT_YMM avx2
+PLANE_DEINTERLEAVE_RGB
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index f1a3037..29c57dd 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -119,6 +119,10 @@ void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
+ pixel *dstb, intptr_t i_dstb,
+ pixel *dstc, intptr_t i_dstc,
+ pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
@@ -826,6 +830,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = x264_integral_init8h_avx2;
pf->integral_init4h = x264_integral_init4h_avx2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
+ pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
#endif // HIGH_BIT_DEPTH
More information about the x264-devel mailing list