[x264-devel] x86: AVX2 high bit-depth predict_8x8c_h/predict_8x16c_h
Henrik Gramner
git at videolan.org
Tue Apr 23 23:03:05 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:00 2013 +0200| [a7c7038f6cc8a0e71e2288b69c60de09c43cdf02] | committer: Jason Garrett-Glaser
x86: AVX2 high bit-depth predict_8x8c_h/predict_8x16c_h
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a7c7038f6cc8a0e71e2288b69c60de09c43cdf02
---
common/x86/predict-a.asm | 113 +++++++++++++++++++++++++---------------------
common/x86/predict-c.c | 6 +++
common/x86/predict.h | 6 ++-
3 files changed, 71 insertions(+), 54 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 06245a6..10f1d9f 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -160,6 +160,44 @@ cextern pw_pixel_max
%endif
%endmacro
+%macro PRED_H_LOAD 2 ; reg, offset -- fills reg from the bytes just before address r0 + offset*stride (same address form LOAD_LEFT uses)
+%if cpuflag(avx2)
+ vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] ; source starts SIZEOF_PIXEL bytes before the row address; presumably broadcast to every lane (macro defined elsewhere)
+%elif HIGH_BIT_DEPTH
+ movd %1, [r0+(%2)*FDEC_STRIDEB-4] ; 4-byte load that ends exactly at the row address
+ SPLATW %1, %1, 1 ; index 1: presumably replicates the upper word of those 4 bytes (SPLATW defined elsewhere)
+%else
+ SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 ; m2 is loaded from [pb_3] by predict_8x8c_h/predict_8x16c_h when ssse3 && !avx2
+%endif
+%endmacro
+
+%macro PRED_H_STORE 3 ; reg, offset, width -- width is in pixels and is multiplied by SIZEOF_PIXEL to get bytes
+%assign %%w %3*SIZEOF_PIXEL
+%if %%w == 8 ; byte width of exactly 8: a single movq covers the row
+ movq [r0+(%2)*FDEC_STRIDEB], %1
+%else ; otherwise emit (byte width / mmsize) full-register stores
+ %assign %%i 0
+ %rep %%w/mmsize
+ mova [r0+(%2)*FDEC_STRIDEB+%%i], %1 ; same register written again at each successive mmsize-byte offset
+ %assign %%i %%i+mmsize
+ %endrep
+%endif
+%endmacro
+
+%macro PRED_H_4ROWS 2 ; width, inc_ptr -- handles 4 consecutive rows using m0/m1; inc_ptr=1 also advances r0 by 4*FDEC_STRIDEB
+ PRED_H_LOAD m0, 0
+ PRED_H_LOAD m1, 1
+ PRED_H_STORE m0, 0, %1
+ PRED_H_STORE m1, 1, %1
+ PRED_H_LOAD m0, 2 ; row 2 is loaded before r0 may be moved
+%if %2
+ add r0, 4*FDEC_STRIDEB ; pointer bump sits mid-sequence, between the row-2 load and the row-3 load
+%endif
+ PRED_H_LOAD m1, 3-4*%2 ; with inc_ptr=1 the offset is -1, i.e. the old row 3 seen from the advanced r0
+ PRED_H_STORE m0, 2-4*%2, %1 ; with inc_ptr=1 the offset is -2, i.e. the old row 2
+ PRED_H_STORE m1, 3-4*%2, %1
+%endmacro
+
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS 4-5
@@ -1674,71 +1712,42 @@ PREDICT_8x16C_V
;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-
-%macro PREDICT_C_H 1
-cglobal predict_8x%1c_h, 1,1
- add r0, FDEC_STRIDEB*4
-%assign Y -4
-%rep %1
- movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
- SPLATW m0, m0, 1
- mova [r0+FDEC_STRIDEB*Y], m0
-%if mmsize == 8
- mova [r0+FDEC_STRIDEB*Y+8], m0
+%macro PREDICT_C_H 0 ; emits both predict_8x8c_h and predict_8x16c_h for whichever INIT_* setting is active
+cglobal predict_8x8c_h, 1,1
+%if cpuflag(ssse3) && notcpuflag(avx2)
+ mova m2, [pb_3] ; m2 is the register PRED_H_LOAD passes on to SPLATB_LOAD
%endif
-%assign Y Y+1
-%endrep
+ PRED_H_4ROWS 8, 1 ; rows 0-3, r0 advanced by 4*FDEC_STRIDEB inside the macro
+ PRED_H_4ROWS 8, 0 ; next 4 rows, r0 left unchanged
RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_C_H 8
-PREDICT_C_H 16
-INIT_XMM sse2
-PREDICT_C_H 8
-PREDICT_C_H 16
-
-%else ; !HIGH_BIT_DEPTH
-
-%macro PREDICT_C_H_CORE 1
-%assign Y %1
-%rep 4
- SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
- mova [r0+FDEC_STRIDE*Y], m0
-%assign Y Y+1
-%endrep
-%endmacro
-%macro PREDICT_C_H 1
-cglobal predict_8x%1c_h, 1,1
-%if cpuflag(ssse3)
- mova m1, [pb_3]
+cglobal predict_8x16c_h, 1,2 ; one more register than predict_8x8c_h: r1d is the loop counter below
+%if cpuflag(ssse3) && notcpuflag(avx2)
+ mova m2, [pb_3] ; same [pb_3] preload as in predict_8x8c_h
%endif
-%if %1==16
- add r0, FDEC_STRIDE*4
- PREDICT_C_H_CORE -4
- add r0, FDEC_STRIDE*4
- PREDICT_C_H_CORE -4
-%endif
- add r0, FDEC_STRIDE*4
- PREDICT_C_H_CORE -4
- PREDICT_C_H_CORE 0
+ mov r1d, 4 ; 4 iterations of 4 rows each
+.loop:
+ PRED_H_4ROWS 8, 1 ; r0 advances by 4*FDEC_STRIDEB every iteration
+ dec r1d
+ jg .loop ; repeat while the counter is still above zero after the decrement
RET
%endmacro
INIT_MMX mmx2
-PREDICT_C_H 8
-PREDICT_C_H 16
+PREDICT_C_H ; the mmx2 pair is built for both bit depths (it sits outside the HIGH_BIT_DEPTH conditional)
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_C_H
+INIT_XMM avx2 ; declared with INIT_XMM, so the avx2 build is set up the same way as the sse2 one apart from the cpu flag
+PREDICT_C_H
+%else
INIT_MMX ssse3
-PREDICT_C_H 8
-PREDICT_C_H 16
-
+PREDICT_C_H
%endif
+
;-----------------------------------------------------------------------------
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
-
%macro LOAD_LEFT 1
movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 96757ea..2c31ded 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -392,6 +392,9 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_AVX) )
return;
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx;
+ if( !(cpu&X264_CPU_AVX2) )
+ return;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_avx2;
#endif
#else
#if ARCH_X86_64
@@ -442,6 +445,9 @@ void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_AVX) )
return;
pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx;
+ if( !(cpu&X264_CPU_AVX2) )
+ return;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_avx2;
#else
pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
if( !(cpu&X264_CPU_MMX2) )
diff --git a/common/x86/predict.h b/common/x86/predict.h
index ca2c20f..d3e415d 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -57,8 +57,9 @@ void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
void x264_predict_8x16c_v_mmx( uint8_t *src );
void x264_predict_8x16c_v_sse( uint16_t *src );
void x264_predict_8x16c_h_mmx2( pixel *src );
-void x264_predict_8x16c_h_sse2( pixel *src );
+void x264_predict_8x16c_h_sse2( uint16_t *src );
void x264_predict_8x16c_h_ssse3( uint8_t *src );
+void x264_predict_8x16c_h_avx2( uint16_t *src );
void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c );
@@ -72,8 +73,9 @@ void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
void x264_predict_8x8c_v_sse( uint16_t *src );
void x264_predict_8x8c_h_mmx2( pixel *src );
-void x264_predict_8x8c_h_sse2( pixel *src );
+void x264_predict_8x8c_h_sse2( uint16_t *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
+void x264_predict_8x8c_h_avx2( uint16_t *src );
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
More information about the x264-devel mailing list