[x264-devel] commit: SSE2 high bit depth dequant functions (Daniel Kang)

Wed Dec 15 04:19:35 CET 2010

x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Wed Dec  8 17:56:22 2010 -0500| [70271f48d601c264963191db61bb207fe426a094] | committer: Jason Garrett-Glaser 

SSE2 high bit depth dequant functions

Patch from Google Code-In.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=70271f48d601c264963191db61bb207fe426a094
---

 common/quant.c         |    3 ++
 common/x86/quant-a.asm |   89 +++++++++++++++++++++++++++++++++++++----------
 common/x86/quant.h     |    6 ++--
 tools/checkasm.c       |    6 ++--
 4 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/common/quant.c b/common/quant.c
index 816e60a..5c05a04 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -322,6 +322,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8 = x264_quant_8x8_sse2;
         pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
+        pf->dequant_4x4 = x264_dequant_4x4_sse2;
+        pf->dequant_8x8 = x264_dequant_8x8_sse2;
+        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
         pf->denoise_dct = x264_denoise_dct_sse2;
         pf->decimate_score15 = x264_decimate_score15_sse2;
         pf->decimate_score16 = x264_decimate_score16_sse2;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 5d7a15e..49906f4 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -473,11 +473,15 @@ QUANT_AC quant_8x8_sse4, 8
 ;;; %1      dct[y][x]
 ;;; %2,%3   dequant_mf[i_mf][y][x]
 ;;; m2      i_qbits
-
     mova     m0, %2
+%ifdef HIGH_BIT_DEPTH
+    pmaddwd  m0, %1
+    pslld    m0, m2
+%else
     packssdw m0, %3
     pmullw   m0, %1
     psllw    m0, m2
+%endif
     mova     %1, m0
 %endmacro
 
@@ -487,8 +491,12 @@ QUANT_AC quant_8x8_sse4, 8
 ;;; m2      -i_qbits
 ;;; m3      f
 ;;; m4      0
-
     mova      m0, %1
+%ifdef HIGH_BIT_DEPTH
+    pmaddwd   m0, %2
+    paddd     m0, m3
+    psrad     m0, m2
+%else
     mova      m1, m0
     punpcklwd m0, m4
     punpckhwd m1, m4
@@ -499,6 +507,7 @@ QUANT_AC quant_8x8_sse4, 8
     psrad     m0, m2
     psrad     m1, m2
     packssdw  m0, m1
+%endif
     mova      %1, m0
 %endmacro
 
@@ -506,14 +515,14 @@ QUANT_AC quant_8x8_sse4, 8
 %if 8*(%2-2*%3)
     mov t0d, 8*(%2-2*%3)
 %%loop:
-    %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
-    %1 [r0+t0     ], [r1+t0*2      ], [r1+t0*2+ 8*%3]
+    %1 [r0+(t0     )*SIZEOF_PIXEL], [r1+t0*2      ], [r1+t0*2+ 8*%3]
+    %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
     sub t0d, 16*%3
     jge %%loop
     REP_RET
 %else
-    %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
-    %1 [r0     ], [r1      ], [r1+ 8*%3]
+    %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
+    %1 [r0+(0   )*SIZEOF_PIXEL], [r1+0    ], [r1+ 8*%3]
     RET
 %endif
 %endmacro
@@ -562,10 +571,10 @@ QUANT_AC quant_8x8_sse4, 8
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
 ;-----------------------------------------------------------------------------
 %macro DEQUANT 4
-cglobal dequant_%2x%2_%1, 0,3
+cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
 .skip_prologue:
     DEQUANT_START %3+2, %3
 
@@ -623,6 +632,13 @@ cglobal dequant_%2x%2_flat16_%1, 0,3
     RET
 %endmacro ; DEQUANT
 
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT sse2, 4, 4, 1
+DEQUANT sse4, 4, 4, 1
+DEQUANT sse2, 8, 6, 1
+DEQUANT sse4, 8, 6, 1
+%else
 %ifndef ARCH_X86_64
 INIT_MMX
 DEQUANT mmx, 4, 4, 1
@@ -631,15 +647,30 @@ DEQUANT mmx, 8, 6, 1
 INIT_XMM
 DEQUANT sse2, 4, 4, 2
 DEQUANT sse2, 8, 6, 2
+%endif
 
-%macro DEQUANT_DC 1
-cglobal dequant_4x4dc_%1, 0,3
+%macro DEQUANT_DC 2
+cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
     DEQUANT_START 6, 6
 
 .lshift:
-    movd   m3, [r1]
-    movd   m2, t0d
-    pslld  m3, m2
+    movd     m3, [r1]
+    movd     m2, t0d
+    pslld    m3, m2
+%ifdef HIGH_BIT_DEPTH
+    pshufd   m3, m3, 0
+%assign x 0
+%rep SIZEOF_PIXEL*16/mmsize
+    mova     m0, [r0+mmsize*0+x]
+    mova     m1, [r0+mmsize*1+x]
+    pmaddwd  m0, m3
+    pmaddwd  m1, m3
+    mova     [r0+mmsize*0+x], m0
+    mova     [r0+mmsize*1+x], m1
+%assign x x+mmsize*2
+%endrep
+
+%else ; !HIGH_BIT_DEPTH
 %if mmsize==16
     pshuflw  m3, m3, 0
     punpcklqdq m3, m3
@@ -647,7 +678,7 @@ cglobal dequant_4x4dc_%1, 0,3
     pshufw   m3, m3, 0
 %endif
 %assign x 0
-%rep 16/mmsize
+%rep SIZEOF_PIXEL*16/mmsize
     mova     m0, [r0+mmsize*0+x]
     mova     m1, [r0+mmsize*1+x]
     pmullw   m0, m3
@@ -656,24 +687,37 @@ cglobal dequant_4x4dc_%1, 0,3
     mova     [r0+mmsize*1+x], m1
 %assign x x+mmsize*2
 %endrep
+%endif ; HIGH_BIT_DEPTH
     RET
 
 .rshift32:
     neg   t0d
     movd  m3, t0d
-    mova  m4, [pw_1]
+    mova  m4, [p%2_1]
     mova  m5, m4
     pslld m4, m3
     psrld m4, 1
     movd  m2, [r1]
+%assign x 0
+%ifdef HIGH_BIT_DEPTH
+    pshufd m2, m2, 0
+%rep SIZEOF_PIXEL*32/mmsize
+    mova      m0, [r0+x]
+    pmaddwd   m0, m2
+    paddd     m0, m4
+    psrad     m0, m3
+    mova      [r0+x], m0
+%assign x x+mmsize
+%endrep
+
+%else
 %if mmsize==8
     punpcklwd m2, m2
 %else
     pshuflw m2, m2, 0
 %endif
     punpcklwd m2, m4
-%assign x 0
-%rep 32/mmsize
+%rep SIZEOF_PIXEL*32/mmsize
     mova      m0, [r0+x]
     mova      m1, m0
     punpcklwd m0, m5
@@ -686,13 +730,20 @@ cglobal dequant_4x4dc_%1, 0,3
     mova      [r0+x], m0
 %assign x x+mmsize
 %endrep
+%endif
     RET
 %endmacro
 
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT_DC sse2  , d
+DEQUANT_DC sse4  , d
+%else
 INIT_MMX
-DEQUANT_DC mmxext
+DEQUANT_DC mmxext, w
 INIT_XMM
-DEQUANT_DC sse2
+DEQUANT_DC sse2  , w
+%endif
 
 %ifdef HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/quant.h b/common/x86/quant.h
index a28099c..2c47d7c 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -47,9 +47,9 @@ int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 020bcab..4a05d2b 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -677,8 +677,8 @@ static int check_dct( int cpu_ref, int cpu_new )
         for( int i = 0; i < 16 && ok; i++ )\
         {\
             for( int j = 0; j < 16; j++ )\
-                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
-                           : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
+                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
+                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
                            : ((*p++)&0x1fff)-0x1000; /* general case */\
             memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
             call_c1( dct_c.name, dct1[0] );\
@@ -1533,7 +1533,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int i = 0; i < 16; i++ ) \
-                    dct1[i] = rand(); \
+                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
                 call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
                 memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \