[x264-devel] commit: x86 asm for high-bit-depth DCT (Oskar Arvidsson)
git at videolan.org
git at videolan.org
Fri Nov 19 23:50:11 CET 2010
x264 | branch: master | Oskar Arvidsson <oskar at irock.se> | Sat Oct 30 16:55:48 2010 +0200| [82a3dc116318d5594ac3474112cfe3472ca1b31e] | committer: Jason Garrett-Glaser
x86 asm for high-bit-depth DCT
Only MMX and DCT done so far; iDCT still needs asm as well.
~4.4x faster than C.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=82a3dc116318d5594ac3474112cfe3472ca1b31e
---
common/dct.c | 13 +++++++++++--
common/x86/dct-a.asm | 36 ++++++++++++++++++++++++++++++++++--
common/x86/dct.h | 6 +++---
common/x86/x86util.asm | 16 +++++++++++++++-
4 files changed, 63 insertions(+), 8 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index f4843b3..4b9528c 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -421,7 +421,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->dct4x4dc = dct4x4dc;
dctf->idct4x4dc = idct4x4dc;
-#if !X264_HIGH_BIT_DEPTH
+#if X264_HIGH_BIT_DEPTH
+#if HAVE_MMX
+ if( cpu&X264_CPU_MMX )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
+ dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
+ dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
+ }
+#endif // HAVE_MMX
+#else // !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
@@ -519,7 +528,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
}
#endif
-#endif // !X264_HIGH_BIT_DEPTH
+#endif // X264_HIGH_BIT_DEPTH
}
void x264_dct_init_weights( void )
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 1566528..ec83335 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -114,6 +114,27 @@ cglobal idct4x4dc_mmx, 1,1
movq [r0+24], m3
RET
+%ifdef X264_HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void sub4x4_dct( int32_t dct[4][4], uint16_t *pix1, uint16_t *pix2 )
+;-----------------------------------------------------------------------------
+cglobal sub4x4_dct_mmx, 3,3 ; r0 = dct (written), r1 = pix1 (read), r2 = pix2 (read)
+.skip_prologue: ; entry label handed to SUB_NxN_DCT when building sub8x8_dct_mmx
+ LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] ; m0 = pix1 - pix2 (word-wise) at offset 0; the high-bit-depth LOAD_DIFF ignores the m4/none arguments
+ LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] ; m3 = difference at 6*STRIDE (loaded out of register order)
+ LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] ; m1 = difference at 2*STRIDE
+ LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] ; m2 = difference at 4*STRIDE; NOTE(review): even multiples presumably because strides count pixels and pixels are 2 bytes here -- confirm
+ DCT4_1D 0,1,2,3,4 ; presumably the first 1-D transform pass over m0..m3 with m4 as temporary (macro defined elsewhere -- confirm)
+ TRANSPOSE4x4W 0,1,2,3,4 ; presumably transposes the 4x4 block of words so the next pass runs along the other dimension -- confirm
+ DCT4_1D 0,1,2,3,4 ; same macro applied a second time after the transpose
+ STORE_DIFF m0, m4, m5, [r0+ 0], [r0+ 8] ; widen m0's words to signed dwords (m4/m5 are scratch) and store both halves
+ STORE_DIFF m1, m4, m5, [r0+16], [r0+24] ; likewise for m1
+ STORE_DIFF m2, m4, m5, [r0+32], [r0+40] ; likewise for m2
+ STORE_DIFF m3, m4, m5, [r0+48], [r0+56] ; likewise for m3; last store is at r0+56
+ RET
+%endif ; X264_HIGH_BIT_DEPTH
+
+%ifndef X264_HIGH_BIT_DEPTH
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
@@ -141,7 +162,9 @@ cglobal sub4x4_dct_%1, 3,3
SUB_DCT4 mmx
SUB_DCT4 ssse3
+%endif ; !X264_HIGH_BIT_DEPTH
+%ifndef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
@@ -213,19 +236,22 @@ cglobal add4x4_idct_sse4, 2,2,6
movd [r0+FDEC_STRIDE*2], m0
pextrd [r0+FDEC_STRIDE*3], m0, 1
RET
+%endif ; !X264_HIGH_BIT_DEPTH
INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
-cglobal %1, 3,3,11
+cglobal %1, 3,3,11*(mmsize/16)
+%ifndef X264_HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
%else
add r2, 4*FDEC_STRIDE
mova m7, [hsub_mul]
%endif
+%endif ; !X264_HIGH_BIT_DEPTH
.skip_prologue:
%ifdef WIN64
sub rsp, 8
@@ -255,7 +281,7 @@ cglobal %1, 3,3,11
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
-cglobal %1, 2,2,11
+cglobal %1, 2,2,11*(mmsize/16)
pxor m7, m7
%if mmsize==16
add r0, 4*FDEC_STRIDE
@@ -282,6 +308,11 @@ cglobal %1, 2,2,11
%endif
%endmacro
+%ifdef X264_HIGH_BIT_DEPTH
+INIT_MMX
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
+%else ; !X264_HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
@@ -310,6 +341,7 @@ ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
cextern sub8x8_dct8_ssse3.skip_prologue
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+%endif ; X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index b943a61..58b9d17 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -28,9 +28,9 @@
#ifndef X264_I386_DCT_H
#define X264_I386_DCT_H
-void x264_sub4x4_dct_mmx ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 720be56..c49895d 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -471,7 +471,10 @@
%macro LOAD_DIFF 5
-%ifidn %3, none
+%ifdef X264_HIGH_BIT_DEPTH
+ mova %1, %4
+ psubw %1, %5
+%elifidn %3, none
movh %1, %4
movh %2, %5
punpcklbw %1, %2
@@ -557,6 +560,16 @@
packuswb %2, %1
%endmacro
+%ifdef X264_HIGH_BIT_DEPTH
+%macro STORE_DIFF 5 ; (src words, scratch lo, scratch hi, dst lo, dst hi): widen %1 from words to signed dwords and store
+ punpcklwd %2, %1 ; low words of %1 land in the upper 16 bits of each dword of %2
+ punpckhwd %3, %1 ; high words of %1 land in the upper 16 bits of each dword of %3
+ psrad %2, 16 ; arithmetic shift sign-extends and drops whatever %2 held before
+ psrad %3, 16 ; same for the high half
+ mova %4, %2 ; store the dwords widened from the low words
+ mova %5, %3 ; store the dwords widened from the high words
+%endmacro
+%else
%macro STORE_DIFF 4
movh %2, %4
punpcklbw %2, %3
@@ -565,6 +578,7 @@
packuswb %1, %1
movh %4, %1
%endmacro
+%endif
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
More information about the x264-devel
mailing list