[x264-devel] x86: Faster AVX2 pixel_sad_x3 and pixel_sad_x4
Henrik Gramner
git at videolan.org
Fri Aug 23 23:06:32 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Fri Jul 5 21:15:49 2013 +0200| [ac5d88116e39bab7a63050ce62bedd5d543902ca] | committer: Jason Garrett-Glaser
x86: Faster AVX2 pixel_sad_x3 and pixel_sad_x4
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=ac5d88116e39bab7a63050ce62bedd5d543902ca
---
common/x86/sad-a.asm | 52 ++++++++++++++++++--------------------------------
encoder/me.c | 10 +++++-----
tools/checkasm.c | 2 +-
3 files changed, 25 insertions(+), 39 deletions(-)
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 46379de..1fad08a 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -32,7 +32,6 @@
SECTION_RODATA 32
pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-deinterleave_sadx4: dd 0,4,2,6
hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
SECTION .text
@@ -1387,12 +1386,12 @@ SAD_X 4, 4, 4
vbroadcasti128 m4, [r0]
vbroadcasti128 m5, [r0+FENC_STRIDE]
movu xm0, [r1]
- movu xm1, [r3]
+ movu xm1, [r2]
movu xm2, [r1+r5]
- movu xm3, [r3+r5]
- vinserti128 m0, m0, [r2], 1
+ movu xm3, [r2+r5]
+ vinserti128 m0, m0, [r3], 1
vinserti128 m1, m1, [r4], 1
- vinserti128 m2, m2, [r2+r5], 1
+ vinserti128 m2, m2, [r3+r5], 1
vinserti128 m3, m3, [r4+r5], 1
psadbw m0, m4
psadbw m1, m4
@@ -1406,12 +1405,12 @@ SAD_X 4, 4, 4
vbroadcasti128 m6, [r0+%1]
vbroadcasti128 m7, [r0+%3]
movu xm2, [r1+%2]
- movu xm3, [r3+%2]
+ movu xm3, [r2+%2]
movu xm4, [r1+%4]
- movu xm5, [r3+%4]
- vinserti128 m2, m2, [r2+%2], 1
+ movu xm5, [r2+%4]
+ vinserti128 m2, m2, [r3+%2], 1
vinserti128 m3, m3, [r4+%2], 1
- vinserti128 m4, m4, [r2+%4], 1
+ vinserti128 m4, m4, [r3+%4], 1
vinserti128 m5, m5, [r4+%4], 1
psadbw m2, m6
psadbw m3, m6
@@ -1443,35 +1442,22 @@ SAD_X 4, 4, 4
%endmacro
%macro SAD_X3_END_AVX2 0
- vextracti128 xm4, m0, 1
- vextracti128 xm5, m1, 1
- vextracti128 xm6, m2, 1
- paddw xm0, xm4
- paddw xm1, xm5
- paddw xm2, xm6
- movhlps xm4, xm0
- movhlps xm5, xm1
- movhlps xm6, xm2
- paddw xm0, xm4
- paddw xm1, xm5
- paddw xm2, xm6
movifnidn r5, r5mp
- movd [r5+0], xm0
- movd [r5+4], xm1
- movd [r5+8], xm2
+ packssdw m0, m1 ; 0 0 1 1 0 0 1 1
+ packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
+ phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1 ; 0 1 2 _
+ mova [r5], xm0
RET
%endmacro
%macro SAD_X4_END_AVX2 0
- mov r0, r6mp
- punpckhqdq m2, m0, m0
- punpckhqdq m3, m1, m1
- paddw m0, m2
- paddw m1, m3
- packssdw m0, m1
- mova xm2, [deinterleave_sadx4]
- vpermd m0, m2, m0
- mova [r0], xm0
+ mov r0, r6mp
+ packssdw m0, m1 ; 0 0 1 1 2 2 3 3
+ vextracti128 xm1, m0, 1
+ phaddd xm0, xm1 ; 0 1 2 3
+ mova [r0], xm0
RET
%endmacro
diff --git a/encoder/me.c b/encoder/me.c
index e87c447..30584c3 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -365,14 +365,14 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
/* hexagon */
COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs );
- COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 );
+ COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */
bcost <<= 3;
COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
- COPY1_IF_LT( bcost, (costs[3]<<3)+5 );
- COPY1_IF_LT( bcost, (costs[4]<<3)+6 );
- COPY1_IF_LT( bcost, (costs[5]<<3)+7 );
+ COPY1_IF_LT( bcost, (costs[4]<<3)+5 );
+ COPY1_IF_LT( bcost, (costs[5]<<3)+6 );
+ COPY1_IF_LT( bcost, (costs[6]<<3)+7 );
if( bcost&7 )
{
@@ -671,7 +671,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
for( i = 0; i < xn-2; i += 3 )
{
pixel *ref = p_fref_w+min_x+my*stride;
- int sads[3];
+ ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( int j = 0; j < 3; j++ )
{
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 8de9bac..5878bc5 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -406,7 +406,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
} \
else \
call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
- if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
+ if( memcmp(res_c, res_asm, N*sizeof(int)) ) \
{ \
ok = 0; \
fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
More information about the x264-devel mailing list