[x264-devel] commit: Faster H asm intra prediction functions (Jason Garrett-Glaser)
git version control
git at videolan.org
Fri Sep 5 22:13:23 CEST 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Sep 3 15:35:22 2008 -0700| [f4d733a90136fe28b9a9a4c7efdce71c0446aeb5] | committer: Jason Garrett-Glaser
Faster H asm intra prediction functions
Take advantage of the H prediction method invented for merged intra SAD and apply it to regular prediction, too.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f4d733a90136fe28b9a9a4c7efdce71c0446aeb5
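For context, horizontal (H) prediction fills each row of a block with the
pixel immediately to its left. The portable C fallbacks removed further down
in this diff state the operation directly; the new asm produces the same
result with pshufw/pshufb byte broadcasts instead of a scalar multiply. A
minimal sketch of that baseline (FDEC_STRIDE is x264's decoded-frame stride):

    #include <stdint.h>

    /* Fill each 16-byte row with the left-neighbor pixel, 8 bytes at a
     * time: multiplying a byte by 0x0101010101010101 replicates it into
     * all 8 lanes of a uint64_t. */
    static void predict_16x16_h( uint8_t *src )
    {
        int y;
        for( y = 0; y < 16; y++ )
        {
            const uint64_t v = 0x0101010101010101ULL * src[-1];
            uint64_t *p = (uint64_t*)src;
            p[0] = p[1] = v;
            src += FDEC_STRIDE;
        }
    }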
---
common/x86/predict-a.asm | 84 ++++++++++++++++++++++++++++++++++++++++++++++
common/x86/predict-c.c | 38 ++++++++-------------
2 files changed, 98 insertions(+), 24 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 88afafb..0768558 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -22,6 +22,7 @@
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
%macro STORE8x8 2
movq [r0 + 0*FDEC_STRIDE], %1
@@ -66,6 +67,7 @@ SECTION_RODATA
ALIGN 16
pb_1: times 16 db 1
+pb_3: times 16 db 3 ; pshufb mask for SPLATB_SSSE3: broadcast byte 3
pw_2: times 4 dw 2
pw_4: times 4 dw 4
pw_8: times 8 dw 8
@@ -152,6 +154,31 @@ cglobal predict_8x8_v_mmxext, 2,2
RET
;-----------------------------------------------------------------------------
+; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal predict_8x8_h_mmxext, 2,2
+    movu        m3, [r1+7]   ; load the 8 left-edge pixels (edge[7..14] = l7..l0)
+    mova        m7, m3
+    punpckhbw   m3, m3       ; high half doubled: l3,l3,l2,l2,l1,l1,l0,l0
+    punpcklbw   m7, m7       ; low half doubled:  l7,l7,l6,l6,l5,l5,l4,l4
+    pshufw      m0, m3, 0xff ; broadcast l0 (left neighbor of row 0)
+    pshufw      m1, m3, 0xaa ; l1
+    pshufw      m2, m3, 0x55 ; l2
+    pshufw      m3, m3, 0x00 ; l3
+    pshufw      m4, m7, 0xff ; l4
+    pshufw      m5, m7, 0xaa ; l5
+    pshufw      m6, m7, 0x55 ; l6
+    pshufw      m7, m7, 0x00 ; l7
+%assign n 0
+%rep 8
+    mova [r0+n*FDEC_STRIDE], m %+ n ; store register mN as row N
+%assign n n+1
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_mmxext, 2,2
@@ -368,6 +395,30 @@ cglobal predict_8x8c_v_mmx, 1,1
RET
;-----------------------------------------------------------------------------
+; void predict_8x8c_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_8x8C_H 1
+cglobal predict_8x8c_h_%1, 1,1
+%ifidn %1, ssse3
+    mova m1, [pb_3 GLOBAL]               ; pshufb mask for SPLATB_SSSE3
+%endif
+%assign n 0
+%rep 8
+    SPLATB m0, r0+FDEC_STRIDE*n-1, m1    ; broadcast the left neighbor of row n
+    mova [r0+FDEC_STRIDE*n], m0          ; fill row n with it
+%assign n n+1
+%endrep
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_8x8C_H mmxext
+%define SPLATB SPLATB_SSSE3
+PRED_8x8C_H ssse3
+
+;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_dc_core_mmxext, 1,1
@@ -543,6 +594,39 @@ cglobal predict_16x16_v_sse2, 1,2
REP_RET
;-----------------------------------------------------------------------------
+; void predict_16x16_h_mmxext( uint8_t *src )
+;-----------------------------------------------------------------------------
+
+%macro PRED_16x16_H 1
+cglobal predict_16x16_h_%1, 1,2
+    mov r1, FDEC_STRIDE*12                 ; walk the block in four-row chunks, offset 12..0
+%ifidn %1, ssse3
+    mova m1, [pb_3 GLOBAL]                 ; pshufb mask for SPLATB_SSSE3
+%endif
+.vloop:
+%assign n 0
+%rep 4
+    SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1   ; broadcast the row's left neighbor
+    mova [r0+r1+FDEC_STRIDE*n], m0
+%if mmsize==8
+    mova [r0+r1+FDEC_STRIDE*n+8], m0       ; MMX: two 8-byte stores cover a 16-wide row
+%endif
+%assign n n+1
+%endrep
+    add r1, -FDEC_STRIDE*4
+    jge .vloop
+    REP_RET
+%endmacro
+
+; no SSE2: it's slower than MMX on all systems that don't support SSSE3
+INIT_MMX
+%define SPLATB SPLATB_MMX
+PRED_16x16_H mmxext
+INIT_XMM
+%define SPLATB SPLATB_SSSE3
+PRED_16x16_H ssse3
+
+;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 489bffb..d70c25a 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -26,13 +26,18 @@
#include "pixel.h"
extern void predict_16x16_v_mmx( uint8_t *src );
+extern void predict_16x16_h_mmxext( uint8_t *src );
+extern void predict_16x16_h_ssse3( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src );
extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
extern void predict_8x8c_v_mmx( uint8_t *src );
+extern void predict_8x8c_h_mmxext( uint8_t *src );
+extern void predict_8x8c_h_ssse3( uint8_t *src );
extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
@@ -126,28 +131,6 @@ static void predict_8x8c_dc_mmxext( uint8_t *src )
}
#ifdef ARCH_X86_64
-static void predict_16x16_h( uint8_t *src )
-{
- int y;
- for( y = 0; y < 16; y++ )
- {
- const uint64_t v = 0x0101010101010101ULL * src[-1];
- uint64_t *p = (uint64_t*)src;
- p[0] = p[1] = v;
- src += FDEC_STRIDE;
- }
-}
-
-static void predict_8x8c_h( uint8_t *src )
-{
- int y;
- for( y = 0; y < 8; y++ )
- {
- *(uint64_t*)src = 0x0101010101010101ULL * src[-1];
- src += FDEC_STRIDE;
- }
-}
-
static void predict_16x16_dc_left( uint8_t *src )
{
uint32_t s = 0;
@@ -496,7 +479,6 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_16x16_H] = predict_16x16_h;
pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
#endif
pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
@@ -505,6 +487,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
+ pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
@@ -513,6 +496,9 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
return;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
}
void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
@@ -520,15 +506,18 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
- pf[I_PRED_CHROMA_H] = predict_8x8c_h;
pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top;
#endif
pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
+ pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
@@ -536,6 +525,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
+ pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
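
The SPLATB macros hide the per-ISA broadcast: the mmxext variant uses broadly
the same unpack+pshufw approach as predict_8x8_h above, while the ssse3
variant uses a single pshufb with the pb_3 constant as its mask. A rough
intrinsics sketch of the ssse3 idea (an illustration only, assuming the macro
loads from src-3 so the wanted byte lands in lane 3, which is what an all-3s
shuffle mask selects; splatb_ssse3_sketch is a hypothetical name):

    #include <tmmintrin.h> /* SSSE3 */
    #include <stdint.h>

    /* Broadcast src[0] to all 16 bytes: load a dword whose byte 3 is the
     * target pixel, then pshufb with a mask of all 3s so every output
     * byte copies that one byte (the pb_3 constant above). */
    static __m128i splatb_ssse3_sketch( const uint8_t *src )
    {
        __m128i x    = _mm_cvtsi32_si128( *(const int32_t*)(src - 3) );
        __m128i mask = _mm_set1_epi8( 3 );
        return _mm_shuffle_epi8( x, mask );
    }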