[x264-devel] XOP 8-bit fDCT

Mon Jan 16 02:11:58 CET 2012

x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Dec 22 14:03:15 2011 -0800| [6b06f6d3f7f800dca1a4ea154f54427d5b3cea2b] | committer: Jason Garrett-Glaser

XOP 8-bit fDCT
Use the XOP integer multiply-accumulate (MAC) instructions for one of the SUMSUB passes.  This is about a dozen cycles faster for the 16x16 fDCT.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6b06f6d3f7f800dca1a4ea154f54427d5b3cea2b
---

 common/dct.c           |    6 ++++++
 common/x86/const-a.asm |    1 +
 common/x86/dct-32.asm  |    7 ++++++-
 common/x86/dct-64.asm  |    7 ++++++-
 common/x86/dct-a.asm   |    2 ++
 common/x86/dct.h       |    2 ++
 common/x86/x86util.asm |   21 ++++-----------------
 7 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 6836bc2..c071ae4 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -597,6 +597,12 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
         dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
     }
+
+    if( cpu&X264_CPU_XOP )
+    {
+        dctf->sub8x8_dct       = x264_sub8x8_dct_xop;
+        dctf->sub16x16_dct     = x264_sub16x16_dct_xop;
+    }
 #endif //HAVE_MMX
 
 #if HAVE_ALTIVEC
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index 8ce98c6..a7cfbe8 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -38,6 +38,7 @@ const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
 
 const pw_1,        times 8 dw 1
 const pw_2,        times 8 dw 2
+const pw_m2,       times 8 dw -2
 const pw_4,        times 8 dw 4
 const pw_8,        times 8 dw 8
 const pw_16,       times 8 dw 16
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index f439508..b0aedc8 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -32,10 +32,13 @@
 
 SECTION .text
 
-%ifndef HIGH_BIT_DEPTH
+cextern pw_2
+cextern pw_m2
 cextern pw_32
 cextern hsub_mul
 
+%ifndef HIGH_BIT_DEPTH
+
 ; in: m0..m7
 ; out: 0,4,6 in mem, rest in regs
 %macro DCT8_1D 9
@@ -402,6 +405,8 @@ INIT_XMM ssse3
 DCT_SUB8
 INIT_XMM avx
 DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
 
 ;-----------------------------------------------------------------------------
 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index e144250..dab0054 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -31,10 +31,13 @@
 
 SECTION .text
 
-%ifndef HIGH_BIT_DEPTH
+cextern pw_2
+cextern pw_m2
 cextern pw_32
 cextern hsub_mul
 
+%ifndef HIGH_BIT_DEPTH
+
 %macro DCT8_1D 10
     SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
     SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
@@ -198,6 +201,8 @@ INIT_XMM ssse3
 DCT_SUB8
 INIT_XMM avx
 DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
 
 ;-----------------------------------------------------------------------------
 ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 1324b48..57432f7 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -452,9 +452,11 @@ INIT_XMM
 cextern sub8x8_dct_sse2.skip_prologue
 cextern sub8x8_dct_ssse3.skip_prologue
 cextern sub8x8_dct_avx.skip_prologue
+cextern sub8x8_dct_xop.skip_prologue
 SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2,  128, 8, 0, 0
 SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
 SUB_NxN_DCT  sub16x16_dct_avx,   sub8x8_dct_avx,   128, 8, 0, 0
+SUB_NxN_DCT  sub16x16_dct_xop,   sub8x8_dct_xop,   128, 8, 0, 0
 
 cextern add8x8_idct_sse2.skip_prologue
 cextern add8x8_idct_avx.skip_prologue
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 3a615df..ad0e9a5 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -38,6 +38,8 @@ void x264_sub8x8_dct_ssse3  ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
 void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_avx    ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_avx  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_xop    ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_xop  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_mmx2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_sse2( dctcoef dct    [ 4], pixel   *pix1, pixel   *pix2 );
 void x264_sub8x16_dct_dc_sse2 ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index b71ed79..097ec9c 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -586,7 +586,10 @@
 %endmacro
 
 %macro SUMSUB2_AB 4
-%ifnum %3
+%if cpuflag(xop)
+    pmacs%1%1 m%4, m%3, [p%1_m2], m%2
+    pmacs%1%1 m%2, m%2, [p%1_2], m%3
+%elifnum %3
     psub%1  m%4, m%2, m%3
     psub%1  m%4, m%3
     padd%1  m%2, m%2
@@ -600,22 +603,6 @@
 %endif
 %endmacro
 
-%macro SUMSUB2_BA 4
-%if avx_enabled
-    padd%1  m%4, m%2, m%3
-    padd%1  m%4, m%3
-    psub%1  m%3, m%2
-    psub%1  m%3, m%2
-    SWAP     %2,  %4
-%else
-    mova    m%4, m%2
-    padd%1  m%2, m%3
-    padd%1  m%2, m%3
-    psub%1  m%3, m%4
-    psub%1  m%3, m%4
-%endif
-%endmacro
-
 %macro SUMSUBD2_AB 5
 %ifnum %4
     psra%1  m%5, m%2, 1  ; %3: %3>>1