[x264-devel] commit: SSE2 high bit depth dequant functions (Daniel Kang)
git at videolan.org
git at videolan.org
Wed Dec 15 04:19:35 CET 2010
x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Wed Dec 8 17:56:22 2010 -0500| [70271f48d601c264963191db61bb207fe426a094] | committer: Jason Garrett-Glaser
SSE2 high bit depth dequant functions
Patch from Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=70271f48d601c264963191db61bb207fe426a094
---
common/quant.c | 3 ++
common/x86/quant-a.asm | 89 +++++++++++++++++++++++++++++++++++++----------
common/x86/quant.h | 6 ++--
tools/checkasm.c | 6 ++--
4 files changed, 79 insertions(+), 25 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index 816e60a..5c05a04 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -322,6 +322,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
+ pf->dequant_4x4 = x264_dequant_4x4_sse2;
+ pf->dequant_8x8 = x264_dequant_8x8_sse2;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 5d7a15e..49906f4 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -473,11 +473,15 @@ QUANT_AC quant_8x8_sse4, 8
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
-
mova m0, %2
+%ifdef HIGH_BIT_DEPTH
+ pmaddwd m0, %1
+ pslld m0, m2
+%else
packssdw m0, %3
pmullw m0, %1
psllw m0, m2
+%endif
mova %1, m0
%endmacro
@@ -487,8 +491,12 @@ QUANT_AC quant_8x8_sse4, 8
;;; m2 -i_qbits
;;; m3 f
;;; m4 0
-
mova m0, %1
+%ifdef HIGH_BIT_DEPTH
+ pmaddwd m0, %2
+ paddd m0, m3
+ psrad m0, m2
+%else
mova m1, m0
punpcklwd m0, m4
punpckhwd m1, m4
@@ -499,6 +507,7 @@ QUANT_AC quant_8x8_sse4, 8
psrad m0, m2
psrad m1, m2
packssdw m0, m1
+%endif
mova %1, m0
%endmacro
@@ -506,14 +515,14 @@ QUANT_AC quant_8x8_sse4, 8
%if 8*(%2-2*%3)
mov t0d, 8*(%2-2*%3)
%%loop:
- %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
- %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
sub t0d, 16*%3
jge %%loop
REP_RET
%else
- %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
- %1 [r0 ], [r1 ], [r1+ 8*%3]
+ %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
+ %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
RET
%endif
%endmacro
@@ -562,10 +571,10 @@ QUANT_AC quant_8x8_sse4, 8
%endmacro
;-----------------------------------------------------------------------------
-; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
-cglobal dequant_%2x%2_%1, 0,3
+cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
.skip_prologue:
DEQUANT_START %3+2, %3
@@ -623,6 +632,13 @@ cglobal dequant_%2x%2_flat16_%1, 0,3
RET
%endmacro ; DEQUANT
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT sse2, 4, 4, 1
+DEQUANT sse4, 4, 4, 1
+DEQUANT sse2, 8, 6, 1
+DEQUANT sse4, 8, 6, 1
+%else
%ifndef ARCH_X86_64
INIT_MMX
DEQUANT mmx, 4, 4, 1
@@ -631,15 +647,30 @@ DEQUANT mmx, 8, 6, 1
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
+%endif
-%macro DEQUANT_DC 1
-cglobal dequant_4x4dc_%1, 0,3
+%macro DEQUANT_DC 2
+cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
DEQUANT_START 6, 6
.lshift:
- movd m3, [r1]
- movd m2, t0d
- pslld m3, m2
+ movd m3, [r1]
+ movd m2, t0d
+ pslld m3, m2
+%ifdef HIGH_BIT_DEPTH
+ pshufd m3, m3, 0
+%assign x 0
+%rep SIZEOF_PIXEL*16/mmsize
+ mova m0, [r0+mmsize*0+x]
+ mova m1, [r0+mmsize*1+x]
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ mova [r0+mmsize*0+x], m0
+ mova [r0+mmsize*1+x], m1
+%assign x x+mmsize*2
+%endrep
+
+%else ; !HIGH_BIT_DEPTH
%if mmsize==16
pshuflw m3, m3, 0
punpcklqdq m3, m3
@@ -647,7 +678,7 @@ cglobal dequant_4x4dc_%1, 0,3
pshufw m3, m3, 0
%endif
%assign x 0
-%rep 16/mmsize
+%rep SIZEOF_PIXEL*16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
pmullw m0, m3
@@ -656,24 +687,37 @@ cglobal dequant_4x4dc_%1, 0,3
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
+%endif ; HIGH_BIT_DEPTH
RET
.rshift32:
neg t0d
movd m3, t0d
- mova m4, [pw_1]
+ mova m4, [p%2_1]
mova m5, m4
pslld m4, m3
psrld m4, 1
movd m2, [r1]
+%assign x 0
+%ifdef HIGH_BIT_DEPTH
+ pshufd m2, m2, 0
+%rep SIZEOF_PIXEL*32/mmsize
+ mova m0, [r0+x]
+ pmaddwd m0, m2
+ paddd m0, m4
+ psrad m0, m3
+ mova [r0+x], m0
+%assign x x+mmsize
+%endrep
+
+%else
%if mmsize==8
punpcklwd m2, m2
%else
pshuflw m2, m2, 0
%endif
punpcklwd m2, m4
-%assign x 0
-%rep 32/mmsize
+%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
mova m1, m0
punpcklwd m0, m5
@@ -686,13 +730,20 @@ cglobal dequant_4x4dc_%1, 0,3
mova [r0+x], m0
%assign x x+mmsize
%endrep
+%endif
RET
%endmacro
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT_DC sse2 , d
+DEQUANT_DC sse4 , d
+%else
INIT_MMX
-DEQUANT_DC mmxext
+DEQUANT_DC mmxext, w
INIT_XMM
-DEQUANT_DC sse2
+DEQUANT_DC sse2 , w
+%endif
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
diff --git a/common/x86/quant.h b/common/x86/quant.h
index a28099c..2c47d7c 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -47,9 +47,9 @@ int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 020bcab..4a05d2b 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -677,8 +677,8 @@ static int check_dct( int cpu_ref, int cpu_new )
for( int i = 0; i < 16 && ok; i++ )\
{\
for( int j = 0; j < 16; j++ )\
- dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
- : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
+ dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
+ : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
: ((*p++)&0x1fff)-0x1000; /* general case */\
memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
call_c1( dct_c.name, dct1[0] );\
@@ -1533,7 +1533,7 @@ static int check_quant( int cpu_ref, int cpu_new )
for( int qp = QP_MAX; qp > 0; qp-- ) \
{ \
for( int i = 0; i < 16; i++ ) \
- dct1[i] = rand(); \
+ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
More information about the x264-devel mailing list