[x264-devel] x86: AVX2 predict_16x16_dc
Henrik Gramner
git at videolan.org
Tue Apr 23 23:29:26 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:25 2013 +0200| [6685eb86ad233d1b38cce2d4f55dff778b54af17] | committer: Jason Garrett-Glaser
x86: AVX2 predict_16x16_dc
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6685eb86ad233d1b38cce2d4f55dff778b54af17
---
common/x86/predict-a.asm | 65 +++++++++++++++++++++++++---------------------
common/x86/predict-c.c | 5 ++++
common/x86/predict.h | 4 ++-
3 files changed, 44 insertions(+), 30 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 5ccccb5..b311cdf 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -2094,8 +2094,7 @@ PREDICT_16x16_H
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
;-----------------------------------------------------------------------------
-
-%macro PRED16x16_DC 2
+%macro PRED16x16_DC_MMX 2
%if HIGH_BIT_DEPTH
mova m0, [r0 - FDEC_STRIDEB+ 0]
paddw m0, [r0 - FDEC_STRIDEB+ 8]
@@ -2124,15 +2123,15 @@ INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%if ARCH_X86_64
movd m6, r1d
- PRED16x16_DC m6, 5
+ PRED16x16_DC_MMX m6, 5
%else
- PRED16x16_DC r1m, 5
+ PRED16x16_DC_MMX r1m, 5
%endif
RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC [pw_8], 4
+ PRED16x16_DC_MMX [pw_8], 4
RET
INIT_MMX mmx2
@@ -2151,19 +2150,19 @@ cglobal predict_16x16_dc_left_core, 1,1
RET
%endif
-;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core( pixel *src, int i_dc_left )
-;-----------------------------------------------------------------------------
-
-%macro PRED16x16_DC_SSE2 2
+%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
- mova m0, [r0 - FDEC_STRIDEB+ 0]
- paddw m0, [r0 - FDEC_STRIDEB+16]
- HADDW m0, m2
- paddw m0, %1
- psrlw m0, %2
- SPLATW m0, m0
+ mova xm0, [r0 - FDEC_STRIDEB+ 0]
+ paddw xm0, [r0 - FDEC_STRIDEB+16]
+ HADDW xm0, xm2
+ paddw xm0, %1
+ psrlw xm0, %2
+ SPLATW m0, xm0
+%if mmsize == 32
+ STORE16 m0
+%else
STORE16 m0, m0
+%endif
%else ; !HIGH_BIT_DEPTH
pxor m0, m0
psadbw m0, [r0 - FDEC_STRIDE]
@@ -2177,28 +2176,36 @@ cglobal predict_16x16_dc_left_core, 1,1
%endif
%endmacro
-INIT_XMM sse2
+%macro PREDICT_16x16_DC_CORE 0
cglobal predict_16x16_dc_core, 2,2,4
- movd m3, r1m
- PRED16x16_DC_SSE2 m3, 5
+ movd xm3, r1m
+ PRED16x16_DC xm3, 5
RET
cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC_SSE2 [pw_8], 4
+ PRED16x16_DC [pw_8], 4
RET
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
- movd m0, r1m
- SPLATW m0, m0
+ movd xm0, r1m
+ SPLATW m0, xm0
+%if HIGH_BIT_DEPTH && mmsize == 16
STORE16 m0, m0
- RET
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,1
- movd m0, r1m
- SPLATW m0, m0
+%else
+%if HIGH_BIT_DEPTH == 0
packuswb m0, m0
+%endif
STORE16 m0
+%endif
RET
+%endmacro
+
+INIT_XMM sse2
+PREDICT_16x16_DC_CORE
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+PREDICT_16x16_DC_CORE
+%else
+INIT_XMM avx2
+PREDICT_16x16_DC_CORE
%endif
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index eccf86b..751b7ca 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -43,6 +43,7 @@ void x264_predict_16x16_dc_##name( pixel *src )\
PREDICT_16x16_DC( mmx2 )
PREDICT_16x16_DC( sse2 )
+PREDICT_16x16_DC( avx2 )
#define PREDICT_16x16_DC_LEFT(name)\
static void x264_predict_16x16_dc_left_##name( pixel *src )\
@@ -58,6 +59,7 @@ static void x264_predict_16x16_dc_left_##name( pixel *src )\
PREDICT_16x16_DC_LEFT( mmx2 )
PREDICT_16x16_DC_LEFT( sse2 )
+PREDICT_16x16_DC_LEFT( avx2 )
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
@@ -381,6 +383,9 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
if( cpu&X264_CPU_AVX2 )
{
pf[I_PRED_16x16_P] = x264_predict_16x16_p_avx2;
+ pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_avx2;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_avx2;
+ pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2;
}
}
diff --git a/common/x86/predict.h b/common/x86/predict.h
index 7691c09..25ba25d 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -44,11 +44,13 @@ void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
+void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
+void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_top_mmx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
-void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
+void x264_predict_16x16_dc_top_avx2( pixel *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
More information about the x264-devel mailing list