[x264-devel] x86: AVX2 intra_sad_x3_8x8c
Jason Garrett-Glaser
git at videolan.org
Mon May 20 23:06:48 CEST 2013
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Sat Apr 27 21:03:32 2013 -0700| [b79f4a6e460b00c85f0ee67b03299bf1d15dd48c] | committer: Jason Garrett-Glaser
x86: AVX2 intra_sad_x3_8x8c
30->22 cycles
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b79f4a6e460b00c85f0ee67b03299bf1d15dd48c
---
common/pixel.c | 1 +
common/x86/pixel.h | 1 +
common/x86/sad-a.asm | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/common/pixel.c b/common/pixel.c
index 28948b2..4e6730f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1304,6 +1304,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 6c06a69..c41f478 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -116,6 +116,7 @@ void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8c_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8c_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_8x8c_avx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_16x16_mmx2 ( pixel *, pixel *, int * );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 19f76f9..ef26207 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -29,9 +29,11 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 ; pshufb mask, same 16 bytes for both lanes: byte 0 to bytes 0-3, byte 8 to bytes 4-7, the -1 entries zero the upper qword
deinterleave_sadx4: dd 0,4,2,6
+hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 ; pshufb mask: bytes 0,2,8,10 each doubled into the low qword, bytes 1,3,9,11 each doubled into the high qword
SECTION .text
@@ -560,6 +562,65 @@ INTRA_SAD_8x8C
INIT_MMX ssse3
INTRA_SAD_8x8C
+INIT_YMM avx2 ; x86inc setup for the function below: m# = ymm, xm# = xmm; the C side refers to it as x264_intra_sad_x3_8x8c_avx2
+cglobal intra_sad_x3_8x8c, 3,3,7 ; void intra_sad_x3_8x8c( pixel *fenc, pixel *fdec, int res[3] ); presumably r0=fenc, r1=fdec, r2=res (x86inc arg mapping - confirm); uses m0-m6
+ vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred: the 8 bytes of the row above the block, replicated into all four qwords
+ add r1, FDEC_STRIDE*4-1 ; r1 now points at column -1 of row 4, so [r1 + FDEC_STRIDE*k] is the left neighbour of row 4+k
+ pxor xm5, xm5 ; zero; m5 is used below as an all-zero ymm operand. NOTE(review): relies on the VEX-encoded xmm write clearing the upper lane - confirm
+ punpckldq xm3, xm2, xm5 ; V0 _ V1 _ : top[0..3], zero dword, top[4..7], zero dword
+ movd xm0, [r1 + FDEC_STRIDE*-1 - 3] ; 4-byte load ending on the left neighbour of row 3, which lands in byte 3
+ movd xm1, [r1 + FDEC_STRIDE* 3 - 3] ; same for row 7
+ pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 ; byte 0 = left neighbour of row 0
+ pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 ; byte 0 = left neighbour of row 4
+ pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 ; byte 1 = left neighbour of row 1
+ pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 ; byte 1 = left neighbour of row 5
+ pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 ; byte 2 = left neighbour of row 2
+ pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 ; byte 2 = left neighbour of row 6
+ punpcklqdq xm0, xm1 ; H0 _ H1 _ : bytes 0-3 = left column rows 0-3, bytes 8-11 = left column rows 4-7, remaining bytes zero
+ vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1 : low lane = the two top halves, high lane = the two left halves (one per qword, zero padded)
+ pshufb xm0, [hpred_shuf] ; H00224466 H11335577 : every left pixel doubled; rows 0,2,4,6 in the low qword, rows 1,3,5,7 in the high qword
+ psadbw m3, m5 ; s0 s1 s2 s3 : SAD against zero = per-qword byte sums (s0,s1 = top left/right half; s2,s3 = left upper/lower half)
+ vpermq m4, m3, q3312 ; s2 s1 s3 s3
+ vpermq m3, m3, q1310 ; s0 s1 s3 s1
+ paddw m3, m4 ; s0+s2 2*s1 2*s3 s1+s3
+ psrlw m3, 2 ; shift right by 2: (s0+s2)/4 s1/2 s3/2 (s1+s3)/4
+ pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 : average with zero is (x+1)/2, which completes the rounded DC value of each quadrant
+ pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _ : per lane, DC of qword 0 in bytes 0-3 and DC of qword 1 in bytes 4-7, upper qword zeroed; DC0 is the 8-byte DC row for rows 0-3, DC1 for rows 4-7
+ vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V : dwords 2,3,6,7 (upper qword of each lane) taken from the V row in m2
+ vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V : DC/V prediction pair used for rows 0-3
+ vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V : DC/V prediction pair used for rows 4-7
+ vpermq m0, m0, q3120 ; H00224466 _ H11335577 _ : even-row H pixels in the low lane, odd-row H pixels in the high lane
+ movddup m2, [r0+FENC_STRIDE*0] ; 32-byte load with qwords 0 and 2 duplicated; presumably FENC_STRIDE is 16 so this is row0 row0 row1 row1 - TODO confirm
+ movddup m4, [r0+FENC_STRIDE*2] ; same layout for the next pair of source rows (rows 2 and 3 under the same assumption)
+ pshuflw m3, m0, q0000 ; broadcast word 0 of each lane's low qword: left(row 0) x8 | left(row 1) x8; upper qwords are don't-care
+ psadbw m3, m2 ; H SAD for rows 0/1 in qwords 0/2
+ psadbw m2, m1 ; per lane: qword 0 = DC SAD, qword 1 = V SAD for rows 0/1
+ pshuflw m5, m0, q1111 ; left(row 2) x8 | left(row 3) x8 (m5 is no longer needed as zero)
+ psadbw m5, m4 ; H SAD for rows 2/3
+ psadbw m4, m1 ; DC and V SAD for rows 2/3 (still the upper-half DC row)
+ paddw m2, m4 ; accumulate DC and V
+ paddw m3, m5 ; accumulate H
+ movddup m4, [r0+FENC_STRIDE*4] ; source rows 4 and 5 (same layout assumption as above)
+ pshuflw m5, m0, q2222 ; left(row 4) x8 | left(row 5) x8
+ psadbw m5, m4 ; H SAD for rows 4/5
+ psadbw m4, m6 ; DC and V SAD for rows 4/5, now against the lower-half DC row
+ paddw m2, m4 ; accumulate DC and V
+ paddw m3, m5 ; accumulate H
+ movddup m4, [r0+FENC_STRIDE*6] ; source rows 6 and 7 (same layout assumption as above)
+ pshuflw m5, m0, q3333 ; left(row 6) x8 | left(row 7) x8
+ psadbw m5, m4 ; H SAD for rows 6/7
+ psadbw m4, m6 ; DC and V SAD for rows 6/7
+ paddw m2, m4 ; accumulate DC and V
+ paddw m3, m5 ; accumulate H
+ vextracti128 xm0, m2, 1 ; high-lane (odd-row) DC and V partial sums
+ vextracti128 xm1, m3, 1 ; high-lane (odd-row) H partial sum
+ paddw xm2, xm0 ; DC V : fold the two lanes; qword 0 = total DC SAD, qword 1 = total V SAD
+ paddw xm3, xm1 ; H : qword 0 = total H SAD
+ pextrd [r2+8], xm2, 2 ; V : dword 2 of xm2 (low half of qword 1) stored at r2+8
+ movd [r2+4], xm3 ; H : stored at r2+4
+ movd [r2+0], xm2 ; DC : stored at r2+0
+ RET
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
More information about the x264-devel
mailing list