[x264-devel] x86: AVX2 high bit-depth predict_16x16_h
Henrik Gramner
git at videolan.org
Tue Apr 23 23:03:05 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:04 2013 +0200| [23fb1d0baeab21f5b571199c39a7b85a3b8e086c] | committer: Jason Garrett-Glaser
x86: AVX2 high bit-depth predict_16x16_h
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=23fb1d0baeab21f5b571199c39a7b85a3b8e086c
---
common/common.h | 2 +-
common/x86/predict-a.asm | 43 ++++++++++---------------------------------
common/x86/predict-c.c | 3 +++
common/x86/predict.h | 1 +
tools/checkasm.c | 2 +-
5 files changed, 16 insertions(+), 35 deletions(-)
diff --git a/common/common.h b/common/common.h
index ecd743b..53a6ff0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -753,7 +753,7 @@ struct x264_t
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 10f1d9f..f255ef1 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -2031,46 +2031,23 @@ cglobal predict_16x16_v, 1,1
;-----------------------------------------------------------------------------
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
- mov r1, 12*FDEC_STRIDEB
-%if HIGH_BIT_DEPTH
-.vloop:
-%assign Y 0
-%rep 4
- movd m0, [r0+r1+Y*FDEC_STRIDEB-2*SIZEOF_PIXEL]
- SPLATW m0, m0, 1
- mova [r0+r1+Y*FDEC_STRIDEB+ 0], m0
- mova [r0+r1+Y*FDEC_STRIDEB+16], m0
-%if mmsize==8
- mova [r0+r1+Y*FDEC_STRIDEB+ 8], m0
- mova [r0+r1+Y*FDEC_STRIDEB+24], m0
-%endif
-%assign Y Y+1
-%endrep
-
-%else ; !HIGH_BIT_DEPTH
-%if cpuflag(ssse3)
- mova m1, [pb_3]
-%endif
-.vloop:
-%assign Y 0
-%rep 4
- SPLATB_LOAD m0, r0+r1+FDEC_STRIDE*Y-1, m1
- mova [r0+r1+FDEC_STRIDE*Y], m0
-%if mmsize==8
- mova [r0+r1+FDEC_STRIDE*Y+8], m0
+%if cpuflag(ssse3) && notcpuflag(avx2)
+ mova m2, [pb_3]
%endif
-%assign Y Y+1
-%endrep
-%endif ; HIGH_BIT_DEPTH
- sub r1, 4*FDEC_STRIDEB
- jge .vloop
+ mov r1d, 4
+.loop:
+ PRED_H_4ROWS 16, 1
+ dec r1d
+ jg .loop
RET
%endmacro
INIT_MMX mmx2
PREDICT_16x16_H
-INIT_XMM sse2
%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_16x16_H
+INIT_YMM avx2
PREDICT_16x16_H
%else
;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 2c31ded..871e8af 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -341,6 +341,9 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
#endif
+ if( !(cpu&X264_CPU_AVX2) )
+ return;
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2;
#else
#if !ARCH_X86_64
pf[I_PRED_16x16_P] = x264_predict_16x16_p_mmx2;
diff --git a/common/x86/predict.h b/common/x86/predict.h
index d3e415d..8f10721 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -38,6 +38,7 @@ void x264_predict_16x16_v_sse ( pixel *src );
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
+void x264_predict_16x16_h_avx2( uint16_t *src );
void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index f871d58..33c2378 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2147,7 +2147,7 @@ static int check_intra( int cpu_ref, int cpu_new )
int ret = 0, ok = 1, used_asm = 0;
ALIGNED_ARRAY_32( pixel, edge,[36] );
ALIGNED_ARRAY_32( pixel, edge2,[36] );
- ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
+ ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] );
struct
{
x264_predict_t predict_16x16[4+3];
More information about the x264-devel mailing list