[x264-devel] XOP 8-bit fDCT
Jason Garrett-Glaser
git at videolan.org
Mon Jan 16 02:11:58 CET 2012
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Dec 22 14:03:15 2011 -0800| [6b06f6d3f7f800dca1a4ea154f54427d5b3cea2b] | committer: Jason Garrett-Glaser
XOP 8-bit fDCT
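Use the XOP integer multiply-accumulate (MAC) instructions (pmacs*) for one of the SUMSUB passes (the SUMSUB2_AB macro). About a dozen cycles faster for 16x16.
Also removes the SUMSUB2_BA macro from x86util.asm.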
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6b06f6d3f7f800dca1a4ea154f54427d5b3cea2b
---
common/dct.c | 6 ++++++
common/x86/const-a.asm | 1 +
common/x86/dct-32.asm | 7 ++++++-
common/x86/dct-64.asm | 7 ++++++-
common/x86/dct-a.asm | 2 ++
common/x86/dct.h | 2 ++
common/x86/x86util.asm | 21 ++++-----------------
7 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 6836bc2..c071ae4 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -597,6 +597,12 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
}
+
+ if( cpu&X264_CPU_XOP )
+ {
+ dctf->sub8x8_dct = x264_sub8x8_dct_xop;
+ dctf->sub16x16_dct = x264_sub16x16_dct_xop;
+ }
#endif //HAVE_MMX
#if HAVE_ALTIVEC
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index 8ce98c6..a7cfbe8 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -38,6 +38,7 @@ const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
const pw_1, times 8 dw 1
const pw_2, times 8 dw 2
+const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 8 dw 16
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index f439508..b0aedc8 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -32,10 +32,13 @@
SECTION .text
-%ifndef HIGH_BIT_DEPTH
+cextern pw_2
+cextern pw_m2
cextern pw_32
cextern hsub_mul
+%ifndef HIGH_BIT_DEPTH
+
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
@@ -402,6 +405,8 @@ INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index e144250..dab0054 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -31,10 +31,13 @@
SECTION .text
-%ifndef HIGH_BIT_DEPTH
+cextern pw_2
+cextern pw_m2
cextern pw_32
cextern hsub_mul
+%ifndef HIGH_BIT_DEPTH
+
%macro DCT8_1D 10
SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
@@ -198,6 +201,8 @@ INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
+INIT_XMM xop
+DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 1324b48..57432f7 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -452,9 +452,11 @@ INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
+cextern sub8x8_dct_xop.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 3a615df..ad0e9a5 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -38,6 +38,8 @@ void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index b71ed79..097ec9c 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -586,7 +586,10 @@
%endmacro
%macro SUMSUB2_AB 4
-%ifnum %3
+%if cpuflag(xop)
+ pmacs%1%1 m%4, m%3, [p%1_m2], m%2
+ pmacs%1%1 m%2, m%2, [p%1_2], m%3
+%elifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
padd%1 m%2, m%2
@@ -600,22 +603,6 @@
%endif
%endmacro
-%macro SUMSUB2_BA 4
-%if avx_enabled
- padd%1 m%4, m%2, m%3
- padd%1 m%4, m%3
- psub%1 m%3, m%2
- psub%1 m%3, m%2
- SWAP %2, %4
-%else
- mova m%4, m%2
- padd%1 m%2, m%3
- padd%1 m%2, m%3
- psub%1 m%3, m%4
- psub%1 m%3, m%4
-%endif
-%endmacro
-
%macro SUMSUBD2_AB 5
%ifnum %4
psra%1 m%5, m%2, 1 ; %3: %3>>1
More information about the x264-devel mailing list