[x264-devel] commit: x86 asm for high-bit-depth DCT (Oskar Arvidsson)

Fri Nov 19 23:50:11 CET 2010

x264 | branch: master | Oskar Arvidsson <oskar at irock.se> | Sat Oct 30 16:55:48 2010 +0200| [82a3dc116318d5594ac3474112cfe3472ca1b31e] | committer: Jason Garrett-Glaser 

x86 asm for high-bit-depth DCT
Only the MMX versions of the forward DCT are done so far; the iDCT still needs asm as well.
~4.4x faster than C.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=82a3dc116318d5594ac3474112cfe3472ca1b31e
---

 common/dct.c           |   13 +++++++++++--
 common/x86/dct-a.asm   |   36 ++++++++++++++++++++++++++++++++++--
 common/x86/dct.h       |    6 +++---
 common/x86/x86util.asm |   16 +++++++++++++++-
 4 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index f4843b3..4b9528c 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -421,7 +421,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     dctf->dct4x4dc  = dct4x4dc;
     dctf->idct4x4dc = idct4x4dc;
 
-#if !X264_HIGH_BIT_DEPTH
+#if X264_HIGH_BIT_DEPTH
+#if HAVE_MMX
+    if( cpu&X264_CPU_MMX )
+    {
+        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
+        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
+    }
+#endif // HAVE_MMX
+#else // !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -519,7 +528,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
     }
 #endif
-#endif // !X264_HIGH_BIT_DEPTH
+#endif // X264_HIGH_BIT_DEPTH
 }
 
 void x264_dct_init_weights( void )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 1566528..ec83335 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -114,6 +114,27 @@ cglobal idct4x4dc_mmx, 1,1
     movq  [r0+24], m3
     RET
 
+%ifdef X264_HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void sub4x4_dct( int32_t dct[4][4], uint16_t *pix1, uint16_t *pix2 )
+;-----------------------------------------------------------------------------
+cglobal sub4x4_dct_mmx, 3,3
+.skip_prologue:
+    LOAD_DIFF  m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+    LOAD_DIFF  m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+    LOAD_DIFF  m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+    LOAD_DIFF  m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+    DCT4_1D 0,1,2,3,4
+    TRANSPOSE4x4W 0,1,2,3,4
+    DCT4_1D 0,1,2,3,4
+    STORE_DIFF m0, m4, m5, [r0+ 0], [r0+ 8]
+    STORE_DIFF m1, m4, m5, [r0+16], [r0+24]
+    STORE_DIFF m2, m4, m5, [r0+32], [r0+40]
+    STORE_DIFF m3, m4, m5, [r0+48], [r0+56]
+    RET
+%endif ; X264_HIGH_BIT_DEPTH
+
+%ifndef X264_HIGH_BIT_DEPTH
 %macro SUB_DCT4 1
 ;-----------------------------------------------------------------------------
 ; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
@@ -141,7 +162,9 @@ cglobal sub4x4_dct_%1, 3,3
 
 SUB_DCT4 mmx
 SUB_DCT4 ssse3
+%endif ; !X264_HIGH_BIT_DEPTH
 
+%ifndef X264_HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
 ; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
@@ -213,19 +236,22 @@ cglobal add4x4_idct_sse4, 2,2,6
     movd     [r0+FDEC_STRIDE*2], m0
     pextrd   [r0+FDEC_STRIDE*3], m0, 1
     RET
+%endif ; !X264_HIGH_BIT_DEPTH
 
 INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 %macro SUB_NxN_DCT 6
-cglobal %1, 3,3,11
+cglobal %1, 3,3,11*(mmsize/16)
+%ifndef X264_HIGH_BIT_DEPTH
 %if mmsize == 8
     pxor m7, m7
 %else
     add r2, 4*FDEC_STRIDE
     mova m7, [hsub_mul]
 %endif
+%endif ; !X264_HIGH_BIT_DEPTH
 .skip_prologue:
 %ifdef WIN64
     sub  rsp, 8
@@ -255,7 +281,7 @@ cglobal %1, 3,3,11
 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
 ;-----------------------------------------------------------------------------
 %macro ADD_NxN_IDCT 6-7
-cglobal %1, 2,2,11
+cglobal %1, 2,2,11*(mmsize/16)
     pxor m7, m7
 %if mmsize==16
     add  r0, 4*FDEC_STRIDE
@@ -282,6 +308,11 @@ cglobal %1, 2,2,11
 %endif
 %endmacro
 
+%ifdef X264_HIGH_BIT_DEPTH
+INIT_MMX
+SUB_NxN_DCT  sub8x8_dct_mmx,    sub4x4_dct_mmx.skip_prologue,  64,  8, 0, 0
+SUB_NxN_DCT  sub16x16_dct_mmx,  sub8x8_dct_mmx.skip_prologue,  64, 16, 8, 8
+%else ; !X264_HIGH_BIT_DEPTH
 %ifndef ARCH_X86_64
 SUB_NxN_DCT  sub8x8_dct_mmx,    sub4x4_dct_mmx.skip_prologue,  32, 4, 0, 0
 ADD_NxN_IDCT add8x8_idct_mmx,   add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
@@ -310,6 +341,7 @@ ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
 
 cextern sub8x8_dct8_ssse3.skip_prologue
 SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+%endif ; X264_HIGH_BIT_DEPTH
 
 
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index b943a61..58b9d17 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -28,9 +28,9 @@
 #ifndef X264_I386_DCT_H
 #define X264_I386_DCT_H
 
-void x264_sub4x4_dct_mmx      ( int16_t dct    [16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx      ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx    ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx      ( dctcoef dct    [16], pixel   *pix1, pixel   *pix2 );
+void x264_sub8x8_dct_mmx      ( dctcoef dct[ 4][16], pixel   *pix1, pixel   *pix2 );
+void x264_sub16x16_dct_mmx    ( dctcoef dct[16][16], pixel   *pix1, pixel   *pix2 );
 void x264_sub8x8_dct_sse2     ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_sse2   ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub4x4_dct_ssse3    ( int16_t dct    [16], uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 720be56..c49895d 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -471,7 +471,10 @@
 
 
 %macro LOAD_DIFF 5
-%ifidn %3, none
+%ifdef X264_HIGH_BIT_DEPTH
+    mova       %1, %4
+    psubw      %1, %5
+%elifidn %3, none
     movh       %1, %4
     movh       %2, %5
     punpcklbw  %1, %2
@@ -557,6 +560,16 @@
     packuswb   %2, %1
 %endmacro
 
+%ifdef X264_HIGH_BIT_DEPTH
+%macro STORE_DIFF 5
+    punpcklwd  %2, %1
+    punpckhwd  %3, %1
+    psrad      %2, 16
+    psrad      %3, 16
+    mova       %4, %2
+    mova       %5, %3
+%endmacro
+%else
 %macro STORE_DIFF 4
     movh       %2, %4
     punpcklbw  %2, %3
@@ -565,6 +578,7 @@
     packuswb   %1, %1
     movh       %4, %1
 %endmacro
+%endif
 
 %macro CLIPW 3 ;(dst, min, max)
     pmaxsw %1, %2