[x264-devel] x86: Faster high bit-depth intra_sad_x3_4x4
Henrik Gramner
git at videolan.org
Mon May 20 23:06:47 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed May 1 17:42:48 2013 +0200| [37edf16c1955cfc9d2843024af0fa7aa6268ad90] | committer: Jason Garrett-Glaser
x86: Faster high bit-depth intra_sad_x3_4x4
20->16 cycles on Ivy Bridge
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=37edf16c1955cfc9d2843024af0fa7aa6268ad90
---
common/x86/sad16-a.asm | 43 ++++++++++++++++++++++++-------------------
encoder/analyse.c | 2 +-
2 files changed, 25 insertions(+), 20 deletions(-)
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 7606da3..62da7cd 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -533,52 +533,57 @@ SAD_X 4, 16, 8
%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
- movq m0, [r1-1*FDEC_STRIDEB]
+ movddup m0, [r1-1*FDEC_STRIDEB] ; load the 8 bytes of the row above r1 into both halves of m0 (replaces movq + punpcklqdq m0, m0)
movq m1, [r0+0*FENC_STRIDEB]
movq m2, [r0+2*FENC_STRIDEB]
pshuflw m6, m0, q1032
paddw m6, m0
pshuflw m5, m6, q2301
paddw m6, m5
- punpcklqdq m6, m6 ;A+B+C+D 8 times
- punpcklqdq m0, m0
+ punpcklqdq m6, m6 ; A+B+C+D 8 times
movhps m1, [r0+1*FENC_STRIDEB]
movhps m2, [r0+3*FENC_STRIDEB]
psubw m3, m1, m0
psubw m0, m2
- ABSW m3, m3, m5
- ABSW m0, m0, m5
+ ABSW2 m3, m0, m3, m0, m4, m5 ; one paired call replaces the two ABSW calls; m4/m5 presumably serve as scratch (ABSW2 is defined outside this hunk)
paddw m0, m3
- HADDW m0, m5
- movd [r2], m0 ;V prediction cost
movd m3, [r1+0*FDEC_STRIDEB-4]
- movhps m3, [r1+1*FDEC_STRIDEB-8]
movd m4, [r1+2*FDEC_STRIDEB-4]
+ movhps m3, [r1+1*FDEC_STRIDEB-8] ; same load as the removed line, reordered after the m4 movd
movhps m4, [r1+3*FDEC_STRIDEB-8]
pshufhw m3, m3, q3333
pshufhw m4, m4, q3333
pshuflw m3, m3, q1111 ; FF FF EE EE
pshuflw m4, m4, q1111 ; HH HH GG GG
paddw m5, m3, m4
- pshufd m0, m5, q1032
+ paddw m6, [pw_4] ; add the pw_4 constant to the A+B+C+D sum (presumably the rounding term for the >>3 below)
+ paddw m6, m5 ; fold in the left-column sums: E+G in the low half, F+H in the high half
+ pshufd m5, m5, q1032 ; exchange the two 64-bit halves of m5 in place, so m0 is no longer overwritten here
paddw m5, m6
- paddw m5, m0
- paddw m5, [pw_4]
psrlw m5, 3
psubw m6, m5, m2
psubw m5, m1
psubw m1, m3
psubw m2, m4
- ABSW m5, m5, m0
- ABSW m6, m6, m0
- ABSW m1, m1, m0
- ABSW m2, m2, m0
+ ABSW2 m5, m6, m5, m6, m3, m4 ; m3/m4 are free here: their E/F/G/H values were last read by the psubw lines above
+ ABSW2 m1, m2, m1, m2, m3, m4 ; m0 is deliberately not used as scratch, it still holds the V differences
paddw m5, m6
paddw m1, m2
- HADDW m5, m0
- HADDW m1, m2
- movd [r2+8], m5 ;DC prediction cost
- movd [r2+4], m1 ;H prediction cost
+%if cpuflag(ssse3) ; SSSE3 and later: reduce all three costs together and store them with one write
+ phaddw m0, m1 ; low half = pairwise sums of V differences, high half = pairwise sums of H differences
+ movhlps m3, m5 ; m3 low half = m5 high half (m3 high half keeps stale data)
+ paddw m5, m3 ; low half of m5 now holds the DC partial sums; the high half is not meaningful
+ phaddw m0, m5 ; words, low to high: V V H H DC DC x x
+ pmaddwd m0, [pw_1] ; sum word pairs into dwords: V, H, DC, junk (pw_1 presumably holds words of 1)
+ mova [r2], m0 ; full-register store of r2+0..r2+15; presumably the aligned move, matching the ALIGNED_ARRAY_16 change in analyse.c
+%else
+ HADDW m0, m3
+ HADDW m1, m3
+ HADDW m5, m3
+ movd [r2], m0 ; V prediction cost
+ movd [r2+4], m1 ; H prediction cost
+ movd [r2+8], m5 ; DC prediction cost
+%endif
RET
%endmacro
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 12b0825..aa90786 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1006,7 +1006,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
{
if( !h->mb.b_lossless && predict_mode[5] >= 0 )
{
- int satd[9];
+ ALIGNED_ARRAY_16( int32_t, satd,[9] );
h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
satd[i_pred_mode] -= 3 * lambda;
More information about the x264-devel mailing list