[x264-devel] commit: SSE2 high bit depth 8x8/16x16 idct/idct_dc (David Czech)
git at videolan.org
git at videolan.org
Tue Dec 7 09:15:51 CET 2010
x264 | branch: master | David Czech <davidczech510 at gmail.com> | Sat Nov 27 17:34:32 2010 -0800| [3f2f3f90175e1da7cc83c70ba22ff823bc657092] | committer: Jason Garrett-Glaser
SSE2 high bit depth 8x8/16x16 idct/idct_dc
Patch from Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=3f2f3f90175e1da7cc83c70ba22ff823bc657092
---
common/dct.c | 10 ++++--
common/x86/dct-a.asm | 82 ++++++++++++++++++++++++++++++++++++++++++++-----
common/x86/dct.h | 7 ++--
3 files changed, 84 insertions(+), 15 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 6a19f06..788452b 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -431,9 +431,13 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
}
if( cpu&X264_CPU_SSE2 )
{
- dctf->add4x4_idct = x264_add4x4_idct_sse2;
- dctf->dct4x4dc = x264_dct4x4dc_sse2;
- dctf->idct4x4dc = x264_idct4x4dc_sse2;
+ dctf->add4x4_idct = x264_add4x4_idct_sse2;
+ dctf->dct4x4dc = x264_dct4x4dc_sse2;
+ dctf->idct4x4dc = x264_idct4x4dc_sse2;
+ dctf->add8x8_idct = x264_add8x8_idct_sse2;
+ dctf->add16x16_idct = x264_add16x16_idct_sse2;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
+ dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 5e063eb..464a00b 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -241,15 +241,14 @@ SUB_DCT4 ssse3
movq %3, %5
movhps %3, %6
paddsw %1, %3
- pxor %4, %4
CLIPW %1, %4, [pw_pixel_max]
movq %5, %1
movhps %6, %1
%endmacro
INIT_XMM
-cglobal add4x4_idct_sse2, 2,2,7
- pxor m6, m6
+cglobal add4x4_idct_sse2, 2,2,6
+ add r0, 4*FDEC_STRIDE
.skip_prologue:
mova m1, [r1+16]
mova m3, [r1+48]
@@ -259,8 +258,9 @@ cglobal add4x4_idct_sse2, 2,2,7
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
- STORE_DIFFx2 m0, m1, m4, m6, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
- STORE_DIFFx2 m2, m3, m4, m6, [r0+4*FDEC_STRIDE], [r0+6*FDEC_STRIDE]
+ pxor m5, m5
+ STORE_DIFFx2 m0, m1, m4, m5, [r0-4*FDEC_STRIDE], [r0-2*FDEC_STRIDE]
+ STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
RET
%else
@@ -365,8 +365,8 @@ cglobal %1, 3,3,11*(mmsize/16)
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
- add rsp, 8
call %2
+ add rsp, 8
RET
%else
jmp %2
@@ -377,8 +377,12 @@ cglobal %1, 3,3,11*(mmsize/16)
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
+%ifdef HIGH_BIT_DEPTH
+cglobal %1, 2,2,6*(mmsize/16)
+%else
cglobal %1, 2,2,11*(mmsize/16)
pxor m7, m7
+%endif
%if mmsize==16
add r0, 4*FDEC_STRIDE
%endif
@@ -396,8 +400,8 @@ cglobal %1, 2,2,11*(mmsize/16)
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
%ifdef WIN64
- add rsp, 8
call %2
+ add rsp, 8
RET
%else
jmp %2
@@ -408,6 +412,9 @@ cglobal %1, 2,2,11*(mmsize/16)
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
+INIT_XMM
+ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2.skip_prologue,64, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
%else ; !HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
@@ -439,11 +446,66 @@ cextern sub8x8_dct8_ssse3.skip_prologue
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH
-
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
;-----------------------------------------------------------------------------
-; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
+; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
+%macro ADD_DC 2 ; %1 = address of first row, %2 = vector of per-word DC offsets (clobbered); also clobbers m0-m2; reads m5/m6 as clip bounds
+ mova m0, [%1+SIZEOF_PIXEL*FDEC_STRIDE*0] ; row 0: one full vector of pixels (8 pixels if pixels are 16-bit and m0 is an xmm register - TODO confirm)
+ mova m1, [%1+SIZEOF_PIXEL*FDEC_STRIDE*1] ; row 1
+ mova m2, [%1+SIZEOF_PIXEL*FDEC_STRIDE*2] ; row 2
+ paddsw m0, %2 ; signed-saturating add of the DC offsets to each 16-bit lane
+ paddsw m1, %2
+ paddsw m2, %2
+ paddsw %2, [%1+SIZEOF_PIXEL*FDEC_STRIDE*3] ; row 3 is accumulated into %2 itself, so %2 no longer holds the offsets afterwards
+ CLIPW m0, m5, m6 ; presumably clamps each word to [m5, m6]; callers load m5 = 0 and m6 = pw_pixel_max
+ CLIPW m1, m5, m6
+ CLIPW m2, m5, m6
+ CLIPW %2, m5, m6
+ mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*0], m0 ; write the four updated rows back in place
+ mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*1], m1
+ mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*2], m2
+ mova [%1+SIZEOF_PIXEL*FDEC_STRIDE*3], %2
+%endmacro
+
+INIT_XMM
+cglobal add8x8_idct_dc_sse2, 2,2,7 ; r0 = p_dst, r1 = dct2x2 (four DC coefficients read as dwords); uses m0-m6
+ mova m6, [pw_pixel_max] ; upper clip bound consumed by ADD_DC
+ pxor m5, m5 ; lower clip bound = 0
+ mova m3, [r1] ; load the four DC coefficients, one per dword
+ paddd m3, [pd_32] ; add the rounding term before the shift
+ psrad m3, 6 ; (dc+32)>>6 per dword; as words: dc0 x dc1 x dc2 x dc3 x (x = sign fill, never selected below)
+ pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
+ pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
+ pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ ADD_DC r0+SIZEOF_PIXEL*FDEC_STRIDE*0, m4 ; rows 0-3: dc0 on the left four pixels, dc1 on the right four
+ ADD_DC r0+SIZEOF_PIXEL*FDEC_STRIDE*4, m3 ; rows 4-7: dc2 on the left four pixels, dc3 on the right four
+ RET
+cglobal add16x16_idct_dc_sse2, 2,3,8 ; r0 = p_dst, r1 = dct (16 DC coefficients read as dwords), r2 = loop counter; uses m0-m7
+ mov r2, 4 ; four passes, each consuming 4 coefficients and updating 4 rows
+ mova m6, [pw_pixel_max] ; upper clip bound consumed by ADD_DC
+ mova m7, [pd_32] ; rounding term kept in a register across iterations
+ pxor m5, m5 ; lower clip bound = 0
+.loop:
+ mova m3, [r1] ; load the next four DC coefficients, one per dword
+ paddd m3, m7
+ psrad m3, 6 ; (dc+32)>>6 per dword; as words: dc0 x dc1 x dc2 x dc3 x (x = sign fill, never selected below)
+ pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
+ pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
+ pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ ADD_DC r0+SIZEOF_PIXEL*FDEC_STRIDE*0, m4 ; left 8 pixels of these 4 rows: dc0 | dc1
+ ADD_DC r0+SIZEOF_PIXEL*8, m3 ; right 8 pixels of the same 4 rows: dc2 | dc3
+ add r1, 16 ; advance past the 16 bytes of coefficients just consumed
+ add r0, 4*FDEC_STRIDE*SIZEOF_PIXEL ; move down 4 rows
+ dec r2
+ jg .loop ; repeat while the counter is still positive
+ REP_RET
+
+%else ;!HIGH_BIT_DEPTH
%macro ADD_DC 3
movq mm4, [%3+FDEC_STRIDE*0]
movq mm5, [%3+FDEC_STRIDE*1]
@@ -625,6 +687,8 @@ cglobal add16x16_idct_dc_ssse3, 2,2,8
IDCT_DC_STORE 0, xmm2, xmm3
ret
+%endif ; HIGH_BIT_DEPTH
+
;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index c1a4c59..bb8c250 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -46,9 +46,10 @@ void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct [16] );
-void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][16] );
-void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][16] );
-void x264_add16x16_idct_dc_sse2 ( uint8_t *p_dst, int16_t dct [16] );
+void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
+void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
+void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] );
+void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] );
void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
More information about the x264-devel
mailing list