[x264-devel] commit: SSE4 version of 4x4 idct (Holger Lubitz)
git version control
git at videolan.org
Wed Oct 7 03:27:44 CEST 2009
x264 | branch: master | Holger Lubitz <holger at lubitz.org> | Tue Oct 6 15:17:34 2009 -0700| [5cda2cf81a75bfd70a5529d3b6fb01b0a7897a5e] | committer: Jason Garrett-Glaser
SSE4 version of 4x4 idct
27->24 clocks on Nehalem.
This is really just an excuse to use "movsd" in a real function.
Add some comments to sumsub-related macros in x86util.
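
For reference, what this routine computes is the standard H.264 4x4 inverse
transform followed by a rounded add into the predicted pixels. Below is a
minimal scalar sketch of the same math, assuming x264's FDEC_STRIDE of 32;
the function and clip helper are illustrative names, not x264's actual code
(the real scalar version lives in common/dct.c):

#include <stdint.h>

#define FDEC_STRIDE 32  /* x264's decoded-frame stride (assumption for this sketch) */

static uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Illustrative scalar equivalent of add4x4_idct: inverse-transform the
 * dequantized coefficients and add the result to the predicted pixels. */
static void add4x4_idct_ref( uint8_t *p_dst, int16_t dct[4][4] )
{
    int tmp[4][4];

    for( int i = 0; i < 4; i++ )  /* horizontal pass over each row */
    {
        int z0 = dct[i][0] + dct[i][2];
        int z1 = dct[i][0] - dct[i][2];
        int z2 = (dct[i][1] >> 1) - dct[i][3];
        int z3 = dct[i][1] + (dct[i][3] >> 1);
        tmp[i][0] = z0 + z3;
        tmp[i][1] = z1 + z2;
        tmp[i][2] = z1 - z2;
        tmp[i][3] = z0 - z3;
    }

    for( int i = 0; i < 4; i++ )  /* vertical pass, then round, add, clip */
    {
        int z0 = tmp[0][i] + tmp[2][i];
        int z1 = tmp[0][i] - tmp[2][i];
        int z2 = (tmp[1][i] >> 1) - tmp[3][i];
        int z3 = tmp[1][i] + (tmp[3][i] >> 1);
        p_dst[0*FDEC_STRIDE+i] = clip_uint8( p_dst[0*FDEC_STRIDE+i] + ((z0+z3+32) >> 6) );
        p_dst[1*FDEC_STRIDE+i] = clip_uint8( p_dst[1*FDEC_STRIDE+i] + ((z1+z2+32) >> 6) );
        p_dst[2*FDEC_STRIDE+i] = clip_uint8( p_dst[2*FDEC_STRIDE+i] + ((z1-z2+32) >> 6) );
        p_dst[3*FDEC_STRIDE+i] = clip_uint8( p_dst[3*FDEC_STRIDE+i] + ((z0-z3+32) >> 6) );
    }
}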
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5cda2cf81a75bfd70a5529d3b6fb01b0a7897a5e
---
common/dct.c | 4 +++
common/x86/dct-a.asm | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/dct.h | 1 +
common/x86/x86util.asm | 25 ++++++++++++++++++---
4 files changed, 81 insertions(+), 4 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 4ac9a86..c5a7913 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -482,6 +482,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
+
+ if( cpu&X264_CPU_SSE4 )
+ dctf->add4x4_idct = x264_add4x4_idct_sse4;
+
#endif //HAVE_MMX
#ifdef ARCH_PPC
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 61abd87..c4ebae5 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -27,6 +27,8 @@
%include "x86util.asm"
SECTION_RODATA
+pw_32_0: times 4 dw 32
+ times 4 dw 0
pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000
hsub_mul: times 8 db 1, -1
@@ -148,6 +150,59 @@ cglobal x264_add4x4_idct_mmx, 2,2
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
RET
+INIT_XMM
+cglobal x264_add4x4_idct_sse4, 2,2,6
+ mova m0, [r1+0x00] ; row1/row0
+ mova m2, [r1+0x10] ; row3/row2
+ mova m1, m0 ; row1/row0
+ psraw m0, 1 ; row1>>1/...
+ mova m3, m2 ; row3/row2
+ psraw m2, 1 ; row3>>1/...
+ movsd m0, m1 ; row1>>1/row0
+ movsd m2, m3 ; row3>>1/row2
+ psubw m0, m3 ; row1>>1-row3/row0-2
+ paddw m2, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 wd, 0, 2, 1
+ SUMSUB_BA m2, m0, m1
+ pshuflw m1, m2, 10110001b
+ pshufhw m2, m2, 10110001b
+ punpckldq m1, m0
+ punpckhdq m2, m0
+ SWAP 0, 1
+
+ mova m1, [pw_32_0 GLOBAL]
+ paddw m1, m0 ; row1/row0 corrected
+ psraw m0, 1 ; row1>>1/...
+ mova m3, m2 ; row3/row2
+ psraw m2, 1 ; row3>>1/...
+ movsd m0, m1 ; row1>>1/row0
+ movsd m2, m3 ; row3>>1/row2
+ psubw m0, m3 ; row1>>1-row3/row0-2
+ paddw m2, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 qdq, 0, 2, 1
+ SUMSUB_BA m2, m0, m1
+
+ movd m4, [r0+FDEC_STRIDE*0]
+ movd m1, [r0+FDEC_STRIDE*1]
+ movd m3, [r0+FDEC_STRIDE*2]
+ movd m5, [r0+FDEC_STRIDE*3]
+ punpckldq m1, m4 ; row0/row1
+ pxor m4, m4
+ punpckldq m3, m5 ; row3/row2
+ punpcklbw m1, m4
+ psraw m2, 6
+ punpcklbw m3, m4
+ psraw m0, 6
+ paddsw m2, m1
+ paddsw m0, m3
+ packuswb m0, m2 ; row0/row1/row3/row2
+ pextrd [r0+FDEC_STRIDE*0], m0, 3
+ pextrd [r0+FDEC_STRIDE*1], m0, 2
+ movd [r0+FDEC_STRIDE*2], m0
+ pextrd [r0+FDEC_STRIDE*3], m0, 1
+ RET
+
+INIT_MMX
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
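
A note on the "movsd" mentioned in the commit message: the xmm-to-xmm form of
movsd copies only the low 64 bits of the source and leaves the destination's
upper quadword untouched, which is exactly what splices the unshifted
row0/row2 halves back under the freshly shifted row1>>1/row3>>1 halves above.
An intrinsics equivalent of that merge, as an illustrative sketch
(merge_low_qword is not an x264 function; x264 uses the bare instruction):

#include <emmintrin.h>

/* movsd dst, src (xmm,xmm): dst.low64 = src.low64, dst.high64 unchanged. */
static __m128i merge_low_qword( __m128i dst, __m128i src )
{
    return _mm_castpd_si128( _mm_move_sd( _mm_castsi128_pd( dst ),
                                          _mm_castsi128_pd( src ) ) );
}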
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 0502b59..9f6ed8d 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -36,6 +36,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
+void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index cfd7767..be010e5 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -28,6 +28,13 @@
SWAP %3, %4
%endmacro
+%macro SBUTTERFLY2 4
+ mova m%4, m%2
+ punpckh%1 m%2, m%3
+ punpckl%1 m%4, m%3
+ SWAP %2, %4, %3
+%endmacro
+
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -386,10 +393,10 @@
%macro SUMSUBD2_AB 4
mova %4, %1
mova %3, %2
- psraw %2, 1
- psraw %1, 1
- paddw %2, %4
- psubw %1, %3
+ psraw %2, 1 ; %2: %2>>1
+ psraw %1, 1 ; %1: %1>>1
+ paddw %2, %4 ; %2: %2>>1+%1
+ psubw %1, %3 ; %1: %1>>1-%2
%endmacro
%macro DCT4_1D 5
@@ -410,14 +417,24 @@
%macro IDCT4_1D 5-6
%ifnum %5
SUMSUBD2_AB m%2, m%4, m%6, m%5
+ ; %2: %2>>1-%4 %4: %2+%4>>1
SUMSUB_BA m%3, m%1, m%6
+ ; %3: %1+%3 %1: %1-%3
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+ ; %4: %1+%3 + (%2+%4>>1)
+ ; %3: %1+%3 - (%2+%4>>1)
+ ; %2: %1-%3 + (%2>>1-%4)
+ ; %1: %1-%3 - (%2>>1-%4)
%else
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
SUMSUB_BA m%3, m%1
SUMSUB_BADC m%4, m%3, m%2, m%1
%endif
SWAP %1, %4, %3
+ ; %1: %1+%3 + (%2+%4>>1) row0
+ ; %2: %1-%3 + (%2>>1-%4) row1
+ ; %3: %1-%3 - (%2>>1-%4) row2
+ ; %4: %1+%3 - (%2+%4>>1) row3
%endmacro
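
Spelled out in scalar C, one IDCT4_1D pass matches the comments above; a
minimal sketch (illustrative names, not x264's code):

/* One 1-D pass of the H.264 4x4 inverse transform, with d[0..3] standing
 * in for the macro's %1..%4 registers. */
static void idct4_1d_ref( int d[4] )
{
    int z0 = d[0] + d[2];           /* SUMSUB_BA: even sum          */
    int z1 = d[0] - d[2];           /*            even difference   */
    int z2 = (d[1] >> 1) - d[3];    /* SUMSUBD2_AB: d1>>1 - d3      */
    int z3 = d[1] + (d[3] >> 1);    /*              d1 + (d3>>1)    */
    d[0] = z0 + z3;                 /* row0 */
    d[1] = z1 + z2;                 /* row1 */
    d[2] = z1 - z2;                 /* row2 */
    d[3] = z0 - z3;                 /* row3 */
}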