[x264-devel] x86: AVX2 high bit-depth intra_sad_x3_8x8

Henrik Gramner git at videolan.org
Mon May 20 23:06:48 CEST 2013


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Apr 28 11:11:03 2013 +0200| [2c0bca3f798e20133f61c3517202942e873e00d6] | committer: Jason Garrett-Glaser

x86: AVX2 high bit-depth intra_sad_x3_8x8

43->24 cycles

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2c0bca3f798e20133f61c3517202942e873e00d6
---

 common/pixel.c         |    1 +
 common/x86/pixel.h     |    1 +
 common/x86/sad16-a.asm |   50 ++++++++++++++++++++++++++++++++++++++++++++++++
 encoder/analyse.c      |    2 +-
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/common/pixel.c b/common/pixel.c
index 2b323cb..28948b2 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1000,6 +1000,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x4, _avx2 );
         pixf->vsad = x264_pixel_vsad_avx2;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
+        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
     }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 555c4ec..6c06a69 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -127,6 +127,7 @@ void x264_intra_sa8d_x3_8x8_sse2   ( pixel   *, pixel   *, int * );
 void x264_intra_sad_x3_8x8_mmx2    ( pixel   *, pixel   *, int * );
 void x264_intra_sad_x3_8x8_sse2    ( pixel   *, pixel   *, int * );
 void x264_intra_sad_x3_8x8_ssse3   ( pixel   *, pixel   *, int * );
+void x264_intra_sad_x3_8x8_avx2    ( uint16_t*, uint16_t*, int * );
 int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
 int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
 int x264_intra_satd_x9_4x4_avx  ( uint8_t *, uint8_t *, uint16_t * );
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 62da7cd..68fa06a 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -644,12 +644,21 @@ cglobal intra_sad_x3_8x8, 3,3,8
     INTRA_SAD_HVDC_ITER 5, q2222
     INTRA_SAD_HVDC_ITER 6, q1111
     INTRA_SAD_HVDC_ITER 7, q0000
+%if cpuflag(ssse3)
+    phaddw      m2, m3     ; 2 2 2 2 3 3 3 3
+    movhlps     m3, m1
+    paddw       m1, m3     ; 1 1 1 1 _ _ _ _
+    phaddw      m2, m1     ; 2 2 3 3 1 1 _ _
+    pmaddwd     m2, [pw_1] ; 2 3 1 _
+    mova      [r2], m2
+%else
     HADDW       m2, m4
     HADDW       m3, m4
     HADDW       m1, m4
     movd    [r2+0], m2
     movd    [r2+4], m3
     movd    [r2+8], m1
+%endif
     RET
 %endmacro
 
@@ -657,3 +666,44 @@ INIT_XMM sse2
 INTRA_SAD_X3_8x8
 INIT_XMM ssse3
 INTRA_SAD_X3_8x8
+
+%macro INTRA_SAD_HVDC_ITER_YMM 2 ; %1 = iteration/row index (0-3), %2 = pshufd selector for this row pair's H predictor
+    mova       xm4, [r0+(%1-4)*FENC_STRIDEB] ; low lane: source row at r0+(%1-4)*stride (caller biases r0 by +4 rows beforehand)
+    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1 ; high lane: the source row four strides further down
+    pshufd      m5, m7, %2 ; replicate the selected dword of m7 across each 128-bit lane
+    psubw       m5, m4
+    pabsw       m5, m5 ; |H predictor - source| per 16-bit pixel
+    ACCUM    paddw, 2, 5, %1 ; H: fold into m2 (ACCUM is defined elsewhere; presumably moves on iteration 0, adds afterwards)
+    psubw       m5, m4, m6 ; source - V predictor (m6)
+    psubw       m4, m0 ; source - DC value (m0); m4 is consumed here, so this must follow the V subtraction
+    pabsw       m5, m5
+    pabsw       m4, m4
+    ACCUM    paddw, 1, 5, %1 ; V: fold |source - V| into m1
+    ACCUM    paddw, 3, 4, %1 ; DC: fold |source - DC| into m3
+%endmacro
+
+INIT_YMM avx2 ; instantiate the following function as the _avx2 variant (macro defined outside this patch)
+cglobal intra_sad_x3_8x8, 3,3,8 ; r0 = source pixels, r1 = edge/neighbor buffer, r2 = int result array (args per the C prototype and call site in this patch)
+    add            r0, 4*FENC_STRIDEB ; bias r0 by 4 rows so the iteration macro can reach rows via offsets (%1-4) and %1
+    movu          xm0, [r1+7*SIZEOF_PIXEL] ; unaligned 16-byte load at edge[7]; used for the H predictors (presumably the left neighbors - confirm against edge layout)
+    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction, copied into both 128-bit lanes
+    vpermq         m7, m0, q0011 ; reorder qwords of m0 so each lane holds the H pixels for its four rows (exact order depends on the q-constant convention defined elsewhere)
+    paddw         xm0, xm6 ; sum the two 8-pixel edge vectors word-wise for the DC value
+    paddw         xm0, [pw_1] ; equal to +8 after HADDW
+    HADDW         xm0, xm4 ; horizontal sum of the words (helper macro defined elsewhere; xm4 passed as its second operand, presumably scratch)
+    psrld         xm0, 4 ; (sum + 8) >> 4 = rounded mean of the 16 edge pixels
+    vpbroadcastw   m0, xm0 ; DC value replicated to every word of m0
+    punpcklwd      m7, m7 ; duplicate each H pixel into a word pair so pshufd can broadcast one pixel per lane
+    INTRA_SAD_HVDC_ITER_YMM 0, q3333
+    INTRA_SAD_HVDC_ITER_YMM 1, q2222
+    INTRA_SAD_HVDC_ITER_YMM 2, q1111
+    INTRA_SAD_HVDC_ITER_YMM 3, q0000
+    phaddw         m1, m2     ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2   (1 = V sums in m1, 2 = H sums in m2)
+    punpckhqdq     m2, m3, m3 ; m2 = high qword of each lane of m3 (DC sums), duplicated
+    paddw          m3, m2     ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
+    phaddw         m1, m3     ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
+    vextracti128  xm2, m1, 1 ; bring the high-lane partial sums down
+    paddw         xm1, xm2    ; 1 1 2 2 3 3 _ _
+    pmaddwd       xm1, [pw_1] ; 1 2 3 _   (adjacent word pairs summed into dwords: V, H, DC, junk)
+    mova         [r2], xm1 ; aligned 16-byte store: writes 4 dwords, so r2 must be 16-byte aligned (cf. ALIGNED_ARRAY_16 satd in encoder/analyse.c)
+    RET
diff --git a/encoder/analyse.c b/encoder/analyse.c
index aa90786..5c6fa52 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -888,7 +888,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             {
                 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                 {
-                    int satd[9];
+                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                     h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                     int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                     satd[i_pred_mode] -= 3 * lambda;



More information about the x264-devel mailing list