[x264-devel] x86: AVX2 high bit-depth predict_8x8c_h/predict_8x16c_h

Tue Apr 23 23:03:05 CEST 2013

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:00 2013 +0200| [a7c7038f6cc8a0e71e2288b69c60de09c43cdf02] | committer: Jason Garrett-Glaser

x86: AVX2 high bit-depth predict_8x8c_h/predict_8x16c_h

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a7c7038f6cc8a0e71e2288b69c60de09c43cdf02
---

 common/x86/predict-a.asm |  113 +++++++++++++++++++++++++---------------------
 common/x86/predict-c.c   |    6 +++
 common/x86/predict.h     |    6 ++-
 3 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 06245a6..10f1d9f 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -160,6 +160,44 @@ cextern pw_pixel_max
 %endif
 %endmacro
 
+%macro PRED_H_LOAD 2 ; reg, offset (row index, scaled by the row stride): fill reg from the data just before the start of that row
+%if cpuflag(avx2)
+    vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] ; source is exactly one pixel before the row start
+%elif HIGH_BIT_DEPTH
+    movd           %1, [r0+(%2)*FDEC_STRIDEB-4] ; loads the 4 bytes that end at the row start
+    SPLATW         %1, %1, 1 ; selects word index 1, i.e. the 2 bytes directly before the row start; SPLATW is defined elsewhere, presumably a word splat
+%else
+    SPLATB_LOAD    %1, r0+(%2)*FDEC_STRIDE-1, m2 ; 8-bit path: byte before the row start; m2 is handed to the helper (callers preload pb_3 into m2 on ssse3)
+%endif
+%endmacro
+
+%macro PRED_H_STORE 3 ; reg, offset (row index), width (in pixels): write reg across one row starting at r0+offset*FDEC_STRIDEB
+%assign %%w %3*SIZEOF_PIXEL ; row width in bytes
+%if %%w == 8
+    movq [r0+(%2)*FDEC_STRIDEB], %1 ; 8-byte row: single movq, independent of mmsize
+%else ; NOTE(review): the %rep count below is 0 (nothing is stored) if %%w < mmsize and %%w != 8
+    %assign %%i 0 ; %%i = byte offset inside the row
+    %rep %%w/mmsize ; one full-register store per mmsize bytes of the row
+        mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
+    %assign %%i %%i+mmsize ; step to the next register-sized chunk
+    %endrep
+%endif
+%endmacro
+
+%macro PRED_H_4ROWS 2 ; width (in pixels), inc_ptr (0 or 1): load+store rows 0..3 relative to r0; if inc_ptr, r0 ends up advanced by 4 rows
+    PRED_H_LOAD  m0, 0 ; rows are handled in pairs, alternating between m0 and m1
+    PRED_H_LOAD  m1, 1
+    PRED_H_STORE m0, 0, %1
+    PRED_H_STORE m1, 1, %1
+    PRED_H_LOAD  m0, 2
+%if %2
+    add          r0, 4*FDEC_STRIDEB ; pointer bump happens mid-sequence; the offsets below subtract 4*%2 to compensate
+%endif
+    PRED_H_LOAD  m1, 3-4*%2 ; still row 3 of this group (offset -1 once r0 has been advanced)
+    PRED_H_STORE m0, 2-4*%2, %1 ; still row 2 of this group (offset -2 once r0 has been advanced)
+    PRED_H_STORE m1, 3-4*%2, %1
+%endmacro
+
 ; dest, left, right, src, tmp
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
 %macro PRED8x8_LOWPASS 4-5
@@ -1674,71 +1712,42 @@ PREDICT_8x16C_V
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_h( uint8_t *src )
 ;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-
-%macro PREDICT_C_H 1
-cglobal predict_8x%1c_h, 1,1
-    add        r0, FDEC_STRIDEB*4
-%assign Y -4
-%rep %1
-    movd       m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
-    SPLATW     m0, m0, 1
-    mova [r0+FDEC_STRIDEB*Y], m0
-%if mmsize == 8
-    mova [r0+FDEC_STRIDEB*Y+8], m0
+%macro PREDICT_C_H 0 ; emits both predict_8x8c_h and predict_8x16c_h for whatever INIT_* cpu flags are active at the invocation
+cglobal predict_8x8c_h, 1,1 ; the block is addressed through r0 by PRED_H_LOAD/PRED_H_STORE
+%if cpuflag(ssse3) && notcpuflag(avx2)
+    mova  m2, [pb_3] ; m2 is the register PRED_H_LOAD passes to SPLATB_LOAD
 %endif
-%assign Y Y+1
-%endrep
+    PRED_H_4ROWS 8, 1 ; rows 0-3, leaves r0 advanced by 4 rows
+    PRED_H_4ROWS 8, 0 ; rows 4-7, r0 not advanced again
     RET
-%endmacro
-
-INIT_MMX mmx2
-PREDICT_C_H 8
-PREDICT_C_H 16
-INIT_XMM sse2
-PREDICT_C_H 8
-PREDICT_C_H 16
-
-%else ; !HIGH_BIT_DEPTH
-
-%macro PREDICT_C_H_CORE 1
-%assign Y %1
-%rep 4
-    SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
-    mova [r0+FDEC_STRIDE*Y], m0
-%assign Y Y+1
-%endrep
-%endmacro
 
-%macro PREDICT_C_H 1
-cglobal predict_8x%1c_h, 1,1
-%if cpuflag(ssse3)
-    mova   m1, [pb_3]
+cglobal predict_8x16c_h, 1,2 ; one more register than the 8x8 version: r1 is the loop counter below
+%if cpuflag(ssse3) && notcpuflag(avx2)
+    mova  m2, [pb_3] ; m2 is the register PRED_H_LOAD passes to SPLATB_LOAD
 %endif
-%if %1==16
-    add    r0, FDEC_STRIDE*4
-    PREDICT_C_H_CORE -4
-    add    r0, FDEC_STRIDE*4
-    PREDICT_C_H_CORE -4
-%endif
-    add    r0, FDEC_STRIDE*4
-    PREDICT_C_H_CORE -4
-    PREDICT_C_H_CORE 0
+    mov  r1d, 4 ; 4 iterations x 4 rows = 16 rows
+.loop:
+    PRED_H_4ROWS 8, 1 ; 4 rows per iteration, r0 advanced by 4 rows each time
+    dec  r1d
+    jg .loop ; repeat while the counter is still positive after the decrement
     RET
 %endmacro
 
 INIT_MMX mmx2
-PREDICT_C_H 8
-PREDICT_C_H 16
+PREDICT_C_H ; mmx2 versions, outside the %if so emitted for both bit depths
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_C_H ; high bit-depth: sse2 versions
+INIT_XMM avx2
+PREDICT_C_H ; high bit-depth: avx2 versions (instantiated under INIT_XMM)
+%else ; !HIGH_BIT_DEPTH
 INIT_MMX ssse3
-PREDICT_C_H 8
-PREDICT_C_H 16
-
+PREDICT_C_H ; 8-bit: ssse3 versions
 %endif
+
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_dc( pixel *src )
 ;-----------------------------------------------------------------------------
-
 %macro LOAD_LEFT 1
     movzx    r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
     movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 96757ea..2c31ded 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -392,6 +392,9 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
     if( !(cpu&X264_CPU_AVX) )
         return;
     pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_avx;
+    if( !(cpu&X264_CPU_AVX2) )
+        return;
+    pf[I_PRED_CHROMA_H]   = x264_predict_8x8c_h_avx2;
 #endif
 #else
 #if ARCH_X86_64
@@ -442,6 +445,9 @@ void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
     if( !(cpu&X264_CPU_AVX) )
         return;
     pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_avx;
+    if( !(cpu&X264_CPU_AVX2) )
+        return;
+    pf[I_PRED_CHROMA_H]   = x264_predict_8x16c_h_avx2;
 #else
     pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_mmx;
     if( !(cpu&X264_CPU_MMX2) )
diff --git a/common/x86/predict.h b/common/x86/predict.h
index ca2c20f..d3e415d 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -57,8 +57,9 @@ void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
 void x264_predict_8x16c_v_mmx( uint8_t *src );
 void x264_predict_8x16c_v_sse( uint16_t *src );
 void x264_predict_8x16c_h_mmx2( pixel *src );
-void x264_predict_8x16c_h_sse2( pixel *src );
+void x264_predict_8x16c_h_sse2( uint16_t *src );
 void x264_predict_8x16c_h_ssse3( uint8_t *src );
+void x264_predict_8x16c_h_avx2( uint16_t *src );
 void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
 void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
 void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c );
@@ -72,8 +73,9 @@ void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
 void x264_predict_8x8c_v_mmx( pixel *src );
 void x264_predict_8x8c_v_sse( uint16_t *src );
 void x264_predict_8x8c_h_mmx2( pixel *src );
-void x264_predict_8x8c_h_sse2( pixel *src );
+void x264_predict_8x8c_h_sse2( uint16_t *src );
 void x264_predict_8x8c_h_ssse3( uint8_t *src );
+void x264_predict_8x8c_h_avx2( uint16_t *src );
 void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] );
 void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );