[x264-devel] commit: Faster H asm intra prediction functions (Jason Garrett-Glaser )

git version control git at videolan.org
Fri Sep 5 22:13:23 CEST 2008


x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Sep  3 15:35:22 2008 -0700| [f4d733a90136fe28b9a9a4c7efdce71c0446aeb5] | committer: Jason Garrett-Glaser 

Faster H asm intra prediction functions
Take advantage of the H prediction method invented for merged intra SAD and apply it to regular prediction, too.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f4d733a90136fe28b9a9a4c7efdce71c0446aeb5
---

 common/x86/predict-a.asm |   84 ++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/predict-c.c   |   38 ++++++++-------------
 2 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 88afafb..0768558 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -22,6 +22,7 @@
 ;*****************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 %macro STORE8x8 2
     movq        [r0 + 0*FDEC_STRIDE], %1
@@ -66,6 +67,7 @@ SECTION_RODATA
 
 ALIGN 16
 pb_1:       times 16 db 1
+pb_3:       times 16 db 3
 pw_2:       times 4 dw 2
 pw_4:       times 4 dw 4
 pw_8:       times 8 dw 8
@@ -152,6 +154,31 @@ cglobal predict_8x8_v_mmxext, 2,2
     RET
 
 ;-----------------------------------------------------------------------------
+; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal predict_8x8_h_mmxext, 2,2
+    movu   m3, [r1+7]
+    mova   m7, m3
+    punpckhbw m3, m3
+    punpcklbw m7, m7
+    pshufw m0, m3, 0xff
+    pshufw m1, m3, 0xaa
+    pshufw m2, m3, 0x55
+    pshufw m3, m3, 0x00
+    pshufw m4, m7, 0xff
+    pshufw m5, m7, 0xaa
+    pshufw m6, m7, 0x55
+    pshufw m7, m7, 0x00
+%assign n 0
+%rep 8
+    mova [r0+n*FDEC_STRIDE], m %+ n
+%assign n n+1
+%endrep
+    RET
+
+;-----------------------------------------------------------------------------
 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_dc_mmxext, 2,2
@@ -368,6 +395,30 @@ cglobal predict_8x8c_v_mmx, 1,1
     RET
 
 ;-----------------------------------------------------------------------------
+; void predict_8x8c_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_8x8C_H 1
+cglobal predict_8x8c_h_%1, 1,1
+%ifidn %1, ssse3
+    mova   m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 8
+    SPLATB m0, r0+FDEC_STRIDE*n-1, m1
+    mova [r0+FDEC_STRIDE*n], m0
+%assign n n+1
+%endrep
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_8x8C_H mmxext
+%define SPLATB SPLATB_SSSE3
+PRED_8x8C_H ssse3
+
+;-----------------------------------------------------------------------------
 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8c_dc_core_mmxext, 1,1
@@ -543,6 +594,39 @@ cglobal predict_16x16_v_sse2, 1,2
     REP_RET
 
 ;-----------------------------------------------------------------------------
+; void predict_16x16_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_16x16_H 1
+cglobal predict_16x16_h_%1, 1,2
+    mov r1, FDEC_STRIDE*12
+%ifidn %1, ssse3
+    mova   m1, [pb_3 GLOBAL]
+%endif
+.vloop:
+%assign n 0
+%rep 4
+    SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
+    mova [r0+r1+FDEC_STRIDE*n], m0
+%if mmsize==8
+    mova [r0+r1+FDEC_STRIDE*n+8], m0
+%endif
+%assign n n+1
+%endrep
+    add r1, -FDEC_STRIDE*4
+    jge .vloop
+    REP_RET
+%endmacro
+
+;no SSE2, it's slower than MMX on all systems that don't support SSSE3
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_16x16_H mmxext
+INIT_XMM
+%define SPLATB SPLATB_SSSE3
+PRED_16x16_H ssse3
+
+;-----------------------------------------------------------------------------
 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
 ;-----------------------------------------------------------------------------
 
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 489bffb..d70c25a 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -26,13 +26,18 @@
 #include "pixel.h"
 
 extern void predict_16x16_v_mmx( uint8_t *src );
+extern void predict_16x16_h_mmxext( uint8_t *src );
+extern void predict_16x16_h_ssse3( uint8_t *src );
 extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
 extern void predict_16x16_dc_top_mmxext( uint8_t *src );
 extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
 extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
 extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
 extern void predict_8x8c_v_mmx( uint8_t *src );
+extern void predict_8x8c_h_mmxext( uint8_t *src );
+extern void predict_8x8c_h_ssse3( uint8_t *src );
 extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
 extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
@@ -126,28 +131,6 @@ static void predict_8x8c_dc_mmxext( uint8_t *src )
 }
 
 #ifdef ARCH_X86_64
-static void predict_16x16_h( uint8_t *src )
-{
-    int y;
-    for( y = 0; y < 16; y++ )
-    {
-        const uint64_t v = 0x0101010101010101ULL * src[-1];
-        uint64_t *p = (uint64_t*)src;
-        p[0] = p[1] = v;
-        src += FDEC_STRIDE;
-    }
-}
-
-static void predict_8x8c_h( uint8_t *src )
-{
-    int y;
-    for( y = 0; y < 8; y++ )
-    {
-        *(uint64_t*)src = 0x0101010101010101ULL * src[-1];
-        src += FDEC_STRIDE;
-    }
-}
-
 static void predict_16x16_dc_left( uint8_t *src )
 {
     uint32_t s = 0;
@@ -496,7 +479,6 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
     if( !(cpu&X264_CPU_MMX) )
         return;
 #ifdef ARCH_X86_64
-    pf[I_PRED_16x16_H]       = predict_16x16_h;
     pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
 #endif
     pf[I_PRED_16x16_V]       = predict_16x16_v_mmx;
@@ -505,6 +487,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC]      = predict_16x16_dc_mmxext;
     pf[I_PRED_16x16_DC_TOP]  = predict_16x16_dc_top_mmxext;
     pf[I_PRED_16x16_P]       = predict_16x16_p_mmxext;
+    pf[I_PRED_16x16_H]       = predict_16x16_h_mmxext;
     if( !(cpu&X264_CPU_SSE2) )
         return;
     pf[I_PRED_16x16_DC]     = predict_16x16_dc_sse2;
@@ -513,6 +496,9 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
         return;
     pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
     pf[I_PRED_16x16_P]      = predict_16x16_p_sse2;
+    if( !(cpu&X264_CPU_SSSE3) )
+        return;
+    pf[I_PRED_16x16_H]      = predict_16x16_h_ssse3;
 }
 
 void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
@@ -520,15 +506,18 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
     if( !(cpu&X264_CPU_MMX) )
         return;
 #ifdef ARCH_X86_64
-    pf[I_PRED_CHROMA_H]       = predict_8x8c_h;
     pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
     pf[I_PRED_CHROMA_DC_TOP]  = predict_8x8c_dc_top;
 #endif
     pf[I_PRED_CHROMA_V]       = predict_8x8c_v_mmx;
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
+    pf[I_PRED_CHROMA_H]       = predict_8x8c_h_mmxext;
     pf[I_PRED_CHROMA_P]       = predict_8x8c_p_mmxext;
     pf[I_PRED_CHROMA_DC]      = predict_8x8c_dc_mmxext;
+    if( !(cpu&X264_CPU_SSSE3) )
+        return;
+    pf[I_PRED_CHROMA_H]       = predict_8x8c_h_ssse3;
 }
 
 void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
@@ -536,6 +525,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
     pf[I_PRED_8x8_V]   = predict_8x8_v_mmxext;
+    pf[I_PRED_8x8_H]   = predict_8x8_h_mmxext;
     pf[I_PRED_8x8_DC]  = predict_8x8_dc_mmxext;
     pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
     pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;



More information about the x264-devel mailing list