[x264-devel] x86: AVX2 intra_sad_x3_8x8c

Jason Garrett-Glaser git at videolan.org
Mon May 20 23:06:48 CEST 2013


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Sat Apr 27 21:03:32 2013 -0700| [b79f4a6e460b00c85f0ee67b03299bf1d15dd48c] | committer: Jason Garrett-Glaser

x86: AVX2 intra_sad_x3_8x8c

30->22 cycles

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b79f4a6e460b00c85f0ee67b03299bf1d15dd48c
---

 common/pixel.c       |    1 +
 common/x86/pixel.h   |    1 +
 common/x86/sad-a.asm |   63 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/common/pixel.c b/common/pixel.c
index 28948b2..4e6730f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1304,6 +1304,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_avx2;
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2;
         pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_avx2;
+        pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
 #if ARCH_X86_64
         pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 6c06a69..c41f478 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -116,6 +116,7 @@ void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
 void x264_intra_sad_x3_8x8c_mmx2   ( pixel   *, pixel   *, int * );
 void x264_intra_sad_x3_8x8c_sse2   ( pixel   *, pixel   *, int * );
 void x264_intra_sad_x3_8x8c_ssse3  ( pixel   *, pixel   *, int * );
+void x264_intra_sad_x3_8x8c_avx2   ( pixel   *, pixel   *, int * );
 void x264_intra_satd_x3_16x16_mmx2 ( pixel   *, pixel   *, int * );
 void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
 void x264_intra_sad_x3_16x16_mmx2  ( pixel   *, pixel   *, int * );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 19f76f9..ef26207 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -29,9 +29,11 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
+pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 ; pshufb mask, same in both lanes: byte 0 x4, byte 8 x4, then 8 zeroed bytes (-1 => output 0)
 deinterleave_sadx4: dd 0,4,2,6
+hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 ; pshufb mask: source bytes 0,2,8,10 then 1,3,9,11, each doubled (even rows | odd rows of the left column)
 
 SECTION .text
 
@@ -560,6 +562,65 @@ INTRA_SAD_8x8C
 INIT_MMX ssse3
 INTRA_SAD_8x8C
 
+INIT_YMM avx2                                  ; m# are used as 256-bit registers below, xm# as their 128-bit low halves
+cglobal intra_sad_x3_8x8c, 3,3,7               ; SADs of an 8x8 block against DC, H and V predictions; r0 = source rows, r1 = block with neighbours, r2 = int[3] out (presumably fenc/fdec/res - confirm against prototype)
+    vpbroadcastq m2, [r1 - FDEC_STRIDE]         ; V pred
+    add          r1, FDEC_STRIDE*4-1            ; r1 += 4 rows - 1 byte: [r1 + FDEC_STRIDE*k] is now the byte left of row 4+k
+    pxor        xm5, xm5                        ; zero; also read as full-width m5 below (NOTE(review): relies on the VEX form clearing the upper lane - confirm)
+    punpckldq   xm3, xm2, xm5                   ; V0 _ V1 _
+    movd        xm0, [r1 + FDEC_STRIDE*-1 - 3]  ; 4 bytes ending at the left pixel of row 3 (byte 3); bytes 0-2 are overwritten below
+    movd        xm1, [r1 + FDEC_STRIDE* 3 - 3]  ; same for row 7
+    pinsrb      xm0, [r1 + FDEC_STRIDE*-4], 0   ; left pixel of row 0
+    pinsrb      xm1, [r1 + FDEC_STRIDE* 0], 0   ; left pixel of row 4
+    pinsrb      xm0, [r1 + FDEC_STRIDE*-3], 1   ; left pixel of row 1
+    pinsrb      xm1, [r1 + FDEC_STRIDE* 1], 1   ; left pixel of row 5
+    pinsrb      xm0, [r1 + FDEC_STRIDE*-2], 2   ; left pixel of row 2
+    pinsrb      xm1, [r1 + FDEC_STRIDE* 2], 2   ; left pixel of row 6
+    punpcklqdq  xm0, xm1                        ; H0 _ H1 _
+    vinserti128  m3, m3, xm0, 1                 ; V0 V1 H0 H1
+    pshufb      xm0, [hpred_shuf]               ; H00224466 H11335577
+    psadbw       m3, m5                         ; s0 s1 s2 s3
+    vpermq       m4, m3, q3312                  ; s2 s1 s3 s3
+    vpermq       m3, m3, q1310                  ; s0 s1 s3 s1
+    paddw        m3, m4                         ; s0+s2 2*s1 2*s3 s1+s3
+    psrlw        m3, 2                          ; together with the pavgw against zero: ((sum >> 2) + 1) >> 1
+    pavgw        m3, m5                         ; s0+s2 s1 s3 s1+s3
+    pshufb       m3, [pb_shuf8x8c2]             ; DC0 _ DC1 _
+    vpblendd     m3, m3, m2, 11001100b          ; DC0 V DC1 V
+    vinserti128  m1, m3, xm3, 1                 ; DC0 V DC0 V
+    vperm2i128   m6, m3, m3, q0101              ; DC1 V DC1 V
+    vpermq       m0, m0, q3120                  ; H00224466 _ H11335577 _
+    movddup      m2, [r0+FENC_STRIDE*0]         ; low lane = 8 bytes at +0 duplicated, high lane = 8 bytes at +16 duplicated (rows 0 | 1 if FENC_STRIDE is 16 - TODO confirm)
+    movddup      m4, [r0+FENC_STRIDE*2]         ; next row pair, same layout (rows 2 | 3 under the same assumption)
+    pshuflw      m3, m0, q0000                  ; low qword of each lane = H0 x8 | H1 x8
+    psadbw       m3, m2                         ; H SAD in qwords 0 and 2 (qwords 1 and 3 are never stored)
+    psadbw       m2, m1                         ; DC SAD in qwords 0 and 2, V SAD in qwords 1 and 3
+    pshuflw      m5, m0, q1111                  ; H2 x8 | H3 x8 (m5 is no longer needed as zero)
+    psadbw       m5, m4                         ; H SAD for this row pair
+    psadbw       m4, m1                         ; DC0 / V SAD for this row pair
+    paddw        m2, m4                         ; accumulate DC and V
+    paddw        m3, m5                         ; accumulate H
+    movddup      m4, [r0+FENC_STRIDE*4]         ; third row pair (rows 4 | 5 under the same assumption)
+    pshuflw      m5, m0, q2222                  ; H4 x8 | H5 x8
+    psadbw       m5, m4
+    psadbw       m4, m6                         ; from here the DC operand is m6 (DC1 V DC1 V)
+    paddw        m2, m4
+    paddw        m3, m5
+    movddup      m4, [r0+FENC_STRIDE*6]         ; fourth row pair (rows 6 | 7 under the same assumption)
+    pshuflw      m5, m0, q3333                  ; H6 x8 | H7 x8
+    psadbw       m5, m4
+    psadbw       m4, m6
+    paddw        m2, m4
+    paddw        m3, m5
+    vextracti128 xm0, m2, 1                     ; high-lane partial sums (the second row of each pair)
+    vextracti128 xm1, m3, 1
+    paddw       xm2, xm0 ; DC V
+    paddw       xm3, xm1 ; H
+    pextrd   [r2+8], xm2, 2 ; V
+    movd     [r2+4], xm3    ; H
+    movd     [r2+0], xm2    ; DC
+    RET
+
 
 ;-----------------------------------------------------------------------------
 ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );



More information about the x264-devel mailing list