[x264-devel] x86: Move predict_16x16_dc_left calculations to asm
Henrik Gramner
git at videolan.org
Tue Sep 20 20:57:52 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Sep 7 19:26:42 2016 +0200| [0c36239a4826f6e5a3cb873aca1814e389a46e29] | committer: Anton Mitrofanov
x86: Move predict_16x16_dc_left calculations to asm
1-2 cycles faster and avoids some code duplication to decrease code size.
Also drop the MMX2 implementation in favor of SSE2 to simplify things.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0c36239a4826f6e5a3cb873aca1814e389a46e29
---
common/pixel.c | 1 +
common/x86/predict-a.asm | 90 ++++++++++++++++--------------------------------
common/x86/predict-c.c | 35 -------------------
common/x86/predict.h | 11 ++----
4 files changed, 34 insertions(+), 103 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index bb59152..3963af7 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -556,6 +556,7 @@ INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
#if HIGH_BIT_DEPTH
#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
+#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 16c29ee..e8954e3 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -2092,63 +2092,28 @@ PREDICT_16x16_H
%endif
;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core( pixel *src, int i_dc_left )
+; void predict_16x16_dc( pixel *src )
;-----------------------------------------------------------------------------
-%macro PRED16x16_DC_MMX 2
-%if HIGH_BIT_DEPTH
- mova m0, [r0 - FDEC_STRIDEB+ 0]
- paddw m0, [r0 - FDEC_STRIDEB+ 8]
- paddw m0, [r0 - FDEC_STRIDEB+16]
- paddw m0, [r0 - FDEC_STRIDEB+24]
- HADDW m0, m1
- paddw m0, %1
- psrlw m0, %2
- SPLATW m0, m0
- STORE16 m0, m0, m0, m0
-%else ; !HIGH_BIT_DEPTH
- pxor m0, m0
- pxor m1, m1
- psadbw m0, [r0 - FDEC_STRIDE]
- psadbw m1, [r0 - FDEC_STRIDE + 8]
- paddusw m0, m1
- paddusw m0, %1
- psrlw m0, %2 ; dc
- pshufw m0, m0, 0
- packuswb m0, m0 ; dc in bytes
- STORE16 m0, m0
-%endif
-%endmacro
-
-INIT_MMX mmx2
-cglobal predict_16x16_dc_core, 1,2
-%if ARCH_X86_64
- movd m6, r1d
- PRED16x16_DC_MMX m6, 5
+%if WIN64
+DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
%else
- PRED16x16_DC_MMX r1m, 5
+DECLARE_REG_TMP 3
%endif
- RET
-
-INIT_MMX mmx2
-cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC_MMX [pw_8], 4
- RET
-INIT_MMX mmx2
-%if HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,2
- movd m0, r1m
- SPLATW m0, m0
- STORE16 m0, m0, m0, m0
- RET
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,1
- movd m0, r1m
- pshufw m0, m0, 0
- packuswb m0, m0
- STORE16 m0, m0
+INIT_XMM
+; Returns the sum of the left pixels in r1d+r2d
+cglobal predict_16x16_dc_left_internal, 0,4
+ movzx r1d, pixel [r0-SIZEOF_PIXEL]
+ movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
+%assign i 2*FDEC_STRIDEB
+%rep 7
+ movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
+ add r1d, t0d
+ movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
+ add r2d, t0d
+%assign i i+2*FDEC_STRIDEB
+%endrep
RET
-%endif
%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
@@ -2176,9 +2141,11 @@ cglobal predict_16x16_dc_left_core, 1,1
%endif
%endmacro
-%macro PREDICT_16x16_DC_CORE 0
-cglobal predict_16x16_dc_core, 2,2,4
- movd xm3, r1m
+%macro PREDICT_16x16_DC 0
+cglobal predict_16x16_dc, 1,3
+ call predict_16x16_dc_left_internal
+ lea r1d, [r1+r2+16]
+ movd xm3, r1d
PRED16x16_DC xm3, 5
RET
@@ -2186,8 +2153,11 @@ cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
RET
-cglobal predict_16x16_dc_left_core, 1,2
- movd xm0, r1m
+cglobal predict_16x16_dc_left, 1,3
+ call predict_16x16_dc_left_internal
+ lea r1d, [r1+r2+8]
+ shr r1d, 4
+ movd xm0, r1d
SPLATW m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
STORE16 m0, m0
@@ -2201,11 +2171,11 @@ cglobal predict_16x16_dc_left_core, 1,2
%endmacro
INIT_XMM sse2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
%if HIGH_BIT_DEPTH
INIT_YMM avx2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
%else
INIT_XMM avx2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
%endif
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index b5a8b45..38ff39e 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -29,38 +29,6 @@
#include "predict.h"
#include "pixel.h"
-#define PREDICT_16x16_DC(name)\
-void x264_predict_16x16_dc_##name( pixel *src )\
-{\
- uint32_t dc = 16;\
- for( int i = 0; i < 16; i += 2 )\
- {\
- dc += src[-1 + i * FDEC_STRIDE];\
- dc += src[-1 + (i+1) * FDEC_STRIDE];\
- }\
- x264_predict_16x16_dc_core_##name( src, dc );\
-}
-
-PREDICT_16x16_DC( mmx2 )
-PREDICT_16x16_DC( sse2 )
-PREDICT_16x16_DC( avx2 )
-
-#define PREDICT_16x16_DC_LEFT(name)\
-static void x264_predict_16x16_dc_left_##name( pixel *src )\
-{\
- uint32_t dc = 8;\
- for( int i = 0; i < 16; i += 2 )\
- {\
- dc += src[-1 + i * FDEC_STRIDE];\
- dc += src[-1 + (i+1) * FDEC_STRIDE];\
- }\
- x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
-}
-
-PREDICT_16x16_DC_LEFT( mmx2 )
-PREDICT_16x16_DC_LEFT( sse2 )
-PREDICT_16x16_DC_LEFT( avx2 )
-
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
@@ -347,9 +315,6 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
{
if( !(cpu&X264_CPU_MMX2) )
return;
- pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmx2;
- pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmx2;
- pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
#if HIGH_BIT_DEPTH
diff --git a/common/x86/predict.h b/common/x86/predict.h
index 662cc64..ba1dd6b 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -40,15 +40,10 @@ void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
void x264_predict_16x16_h_avx2( uint16_t *src );
-void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
-void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_top_mmx2( pixel *src );
+void x264_predict_16x16_dc_avx2( pixel *src );
+void x264_predict_16x16_dc_left_sse2( pixel *src );
+void x264_predict_16x16_dc_left_avx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_avx2( pixel *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
More information about the x264-devel mailing list