[x264-devel] x86: AVX2 high bit-depth intra_sad_x3_8x8
Henrik Gramner
git at videolan.org
Mon May 20 23:06:48 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Apr 28 11:11:03 2013 +0200| [2c0bca3f798e20133f61c3517202942e873e00d6] | committer: Jason Garrett-Glaser
x86: AVX2 high bit-depth intra_sad_x3_8x8
43->24 cycles
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2c0bca3f798e20133f61c3517202942e873e00d6
---
common/pixel.c | 1 +
common/x86/pixel.h | 1 +
common/x86/sad16-a.asm | 50 ++++++++++++++++++++++++++++++++++++++++++++++++
encoder/analyse.c | 2 +-
4 files changed, 53 insertions(+), 1 deletion(-)
diff --git a/common/pixel.c b/common/pixel.c
index 2b323cb..28948b2 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1000,6 +1000,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _avx2 );
pixf->vsad = x264_pixel_vsad_avx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 555c4ec..6c06a69 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -127,6 +127,7 @@ void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 62da7cd..68fa06a 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -644,12 +644,21 @@ cglobal intra_sad_x3_8x8, 3,3,8
INTRA_SAD_HVDC_ITER 5, q2222
INTRA_SAD_HVDC_ITER 6, q1111
INTRA_SAD_HVDC_ITER 7, q0000
+%if cpuflag(ssse3)
+ phaddw m2, m3 ; 2 2 2 2 3 3 3 3
+ movhlps m3, m1
+ paddw m1, m3 ; 1 1 1 1 _ _ _ _
+ phaddw m2, m1 ; 2 2 3 3 1 1 _ _
+ pmaddwd m2, [pw_1] ; 2 3 1 _
+ mova [r2], m2
+%else
HADDW m2, m4
HADDW m3, m4
HADDW m1, m4
movd [r2+0], m2
movd [r2+4], m3
movd [r2+8], m1
+%endif
RET
%endmacro
@@ -657,3 +666,44 @@ INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8
+
+%macro INTRA_SAD_HVDC_ITER_YMM 2 ; %1 = iteration index (invoked with 0-3), %2 = pshufd selector for this iteration's H-prediction dword in m7
+ mova xm4, [r0+(%1-4)*FENC_STRIDEB] ; low lane: source row at (%1-4) strides from r0
+ vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1 ; high lane: source row four strides further down
+ pshufd m5, m7, %2 ; replicate the selected dword of m7 within each lane
+ psubw m5, m4
+ pabsw m5, m5 ; absolute difference against the H prediction
+ ACCUM paddw, 2, 5, %1 ; H, accumulated in m2 (ACCUM is defined elsewhere; presumably a plain move on iteration 0 - confirm)
+ psubw m5, m4, m6 ; source minus m6 (V prediction)
+ psubw m4, m0 ; source minus m0 (DC prediction)
+ pabsw m5, m5
+ pabsw m4, m4
+ ACCUM paddw, 1, 5, %1 ; V, accumulated in m1
+ ACCUM paddw, 3, 4, %1 ; DC, accumulated in m3
+%endmacro
+
+INIT_YMM avx2
+cglobal intra_sad_x3_8x8, 3,3,8 ; three args: r0 = source pixels, r1 = edge pixels, r2 = int output array (per the C prototype and call site in this patch)
+ add r0, 4*FENC_STRIDEB ; bias r0 by four rows so each iteration can reach rows i and i+4 as offsets (i-4) and i
+ movu xm0, [r1+7*SIZEOF_PIXEL] ; 16 bytes of edge data that feed the H prediction and the DC sum
+ vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction, copied to both lanes
+ vpermq m7, m0, q0011 ; upper qword of xm0 -> low lane, lower qword -> high lane (assuming x86inc q-constant ordering)
+ paddw xm0, xm6 ; word-wise sum of the two edge vectors
+ paddw xm0, [pw_1] ; equal to +8 after HADDW
+ HADDW xm0, xm4 ; horizontal word sum (HADDW is defined elsewhere; xm4 presumably scratch)
+ psrld xm0, 4 ; divide the rounded sum by 16
+ vpbroadcastw m0, xm0 ; DC prediction in every word of m0
+ punpcklwd m7, m7 ; duplicate each word so every dword holds one pixel twice, ready for pshufd
+ INTRA_SAD_HVDC_ITER_YMM 0, q3333 ; block rows 0 and 4
+ INTRA_SAD_HVDC_ITER_YMM 1, q2222 ; block rows 1 and 5
+ INTRA_SAD_HVDC_ITER_YMM 2, q1111 ; block rows 2 and 6
+ INTRA_SAD_HVDC_ITER_YMM 3, q0000 ; block rows 3 and 7
+ phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
+ punpckhqdq m2, m3, m3
+ paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
+ phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
+ vextracti128 xm2, m1, 1 ; high lane holds the partial sums from rows 4-7
+ paddw xm1, xm2 ; 1 1 2 2 3 3 _ _
+ pmaddwd xm1, [pw_1] ; 1 2 3 _
+ mova [r2], xm1 ; 16-byte store: V, H, DC sums followed by one don't-care dword
+ RET
diff --git a/encoder/analyse.c b/encoder/analyse.c
index aa90786..5c6fa52 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -888,7 +888,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
{
if( !h->mb.b_lossless && predict_mode[5] >= 0 )
{
- int satd[9];
+ ALIGNED_ARRAY_16( int32_t, satd,[9] );
h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
satd[i_pred_mode] -= 3 * lambda;
More information about the x264-devel
mailing list