[x264-devel] commit: special case dequant for flat matrix (Loren Merritt)

Sat Mar 22 03:06:09 CET 2008

x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Fri Mar 21 18:46:29 2008 -0600| [afba69a247ee3ff4ae9781cb63093529175ec135]

special case dequant for flat matrix

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=afba69a247ee3ff4ae9781cb63093529175ec135
---

 common/quant.c         |   10 +++
 common/x86/quant-a.asm |  178 +++++++++++++++++++++++++++++++++++-------------
 common/x86/quant.h     |    4 +
 tools/checkasm.c       |    7 ++
 4 files changed, 152 insertions(+), 47 deletions(-)

diff --git a/common/quant.c b/common/quant.c
index 024dc6e..270f979 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -211,6 +211,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
+        if( h->param.i_cqm_preset == X264_CQM_FLAT )
+        {
+            pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
+            pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
+        }
 #endif
     }
 
@@ -229,6 +234,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8 = x264_quant_8x8_sse2;
         pf->dequant_4x4 = x264_dequant_4x4_sse2;
         pf->dequant_8x8 = x264_dequant_8x8_sse2;
+        if( h->param.i_cqm_preset == X264_CQM_FLAT )
+        {
+            pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
+            pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
+        }
     }
 #endif
 
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 886b340..c986826 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -25,7 +25,34 @@
 
 SECTION_RODATA
 pw_1:     times 8 dw 1
-pd_10000: times 4 dd 1<<16
+pd_1:     times 4 dd 1
+
+%macro DQM4 3
+    dw %1, %2, %1, %2, %2, %3, %2, %3
+%endmacro
+%macro DQM8 6
+    dw %1, %4, %5, %4, %1, %4, %5, %4
+    dw %4, %2, %6, %2, %4, %2, %6, %2
+    dw %5, %6, %3, %6, %5, %6, %3, %6
+    ; last line not used, just padding for power-of-2 stride
+    times 8 dw 0
+%endmacro
+
+dequant4_scale:
+    DQM4 10, 13, 16
+    DQM4 11, 14, 18
+    DQM4 13, 16, 20
+    DQM4 14, 18, 23
+    DQM4 16, 20, 25
+    DQM4 18, 23, 29
+
+dequant8_scale:
+    DQM8 20, 18, 32, 19, 25, 24
+    DQM8 22, 19, 35, 21, 28, 26
+    DQM8 26, 23, 42, 24, 33, 31
+    DQM8 28, 25, 45, 26, 35, 33
+    DQM8 32, 28, 51, 30, 40, 38
+    DQM8 36, 32, 58, 34, 46, 43
 
 SECTION .text
 
@@ -139,11 +166,9 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
 ;;; %2,%3   dequant_mf[i_mf][y][x]
 ;;; m5      i_qbits
 
-    movq     m1, %2
-    movq     m2, %3
-    movq     m0, %1
-    packssdw m1, m2
-    pmullw   m0, m1
+    movq     m0, %2
+    packssdw m0, %3
+    pmullw   m0, %1
     psllw    m0, m5
     movq     %1, m0
 %endmacro
@@ -151,21 +176,18 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
 %macro DEQUANT32_R 3
 ;;; %1      dct[y][x]
 ;;; %2,%3   dequant_mf[i_mf][y][x]
-;;; m4      f
 ;;; m5      -i_qbits
-;;; m6      1
+;;; m6      f
 ;;; m7      0
 
     movq      m0, %1
     movq      m1, m0
-    movq      m2, %2
-    movq      m3, %3
-    punpcklwd m0, m4
-    punpckhwd m1, m4
-    por       m2, m6 ; FIXME munge precomputed arrays?
-    por       m3, m6
-    pmaddwd   m0, m2
-    pmaddwd   m1, m3
+    punpcklwd m0, m7
+    punpckhwd m1, m7
+    pmaddwd   m0, %2
+    pmaddwd   m1, %3
+    paddd     m0, m6
+    paddd     m1, m6
     psrad     m0, m5
     psrad     m1, m5
     packssdw  m0, m1
@@ -188,60 +210,122 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
 %endif
 %endmacro
 
-;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-;-----------------------------------------------------------------------------
-%macro DEQUANT_WxH 4
-cglobal %1, 0,3
+%macro DEQUANT16_FLAT 2-8
+    movq   m0, %1
+%assign i %0-2
+%rep %0-1
+%if i
+    movq   m %+ i, [r0+%2]
+    pmullw m %+ i, m0
+%else
+    pmullw m0, [r0+%2]
+%endif
+    psllw  m %+ i, m7
+    movq   [r0+%2], m %+ i
+    %assign i i-1
+    %rotate 1
+%endrep
+%endmacro
+
 %ifdef ARCH_X86_64
     %define t0  r4
     %define t0d r4d
-    imul r4d, r2d, 0x2b
-    shr  r4d, 8     ; i_qbits = i_qp / 6
-    lea  r3, [r4*3]
-    sub  r2d, r3d
-    sub  r2d, r3d   ; i_mf = i_qp % 6
-    shl  r2d, %3+2
-    add  r1, r2     ; dequant_mf[i_mf]
+    %define t1  r3
+    %define t1d r3d
+    %define t2  r2
+    %define t2d r2d
 %else
     %define t0  r2
     %define t0d r2d
-    mov  r1, r2m    ; i_qp
-    imul r2, r1, 0x2b
-    shr  r2, 8      ; i_qbits = i_qp / 6
-    lea  r0, [r2*3]
-    sub  r1, r0
-    sub  r1, r0     ; i_mf = i_qp % 6
-    shl  r1, %3+2
+    %define t1  r0
+    %define t1d r0d
+    %define t2  r1
+    %define t2d r1d
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+;-----------------------------------------------------------------------------
+%macro DEQUANT 4
+cglobal x264_dequant_%2x%2_%1, 0,3
+    movifnidn t2d, r2m
+    imul t0d, t2d, 0x2b
+    shr  t0d, 8     ; i_qbits = i_qp / 6
+    lea  t1, [t0*3]
+    sub  t2d, t1d
+    sub  t2d, t1d   ; i_mf = i_qp % 6
+    shl  t2d, %3+2
+%ifdef ARCH_X86_64
+    add  r1, t2     ; dequant_mf[i_mf]
+%else
     add  r1, r1m    ; dequant_mf[i_mf]
     mov  r0, r0m    ; dct
 %endif
-
     sub  t0d, %3
     jl   .rshift32  ; negative qbits => rightshift
 
 .lshift:
     movd m5, t0d
-    DEQUANT_LOOP DEQUANT16_L, %2, %4
+    DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
 
 .rshift32:
     neg   t0d
     movd  m5, t0d
     picgetgot t0d
-    movq  m4, [pw_1 GLOBAL]
-    movq  m6, [pd_10000 GLOBAL]
-    psllw m4, m5
+    movq  m6, [pd_1 GLOBAL]
     pxor  m7, m7
-    psrlw m4, 1
-    DEQUANT_LOOP DEQUANT32_R, %2, %4
-%endmacro
+    pslld m6, m5
+    psrld m6, 1
+    DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
+
+cglobal x264_dequant_%2x%2_flat16_%1, 0,3
+    movifnidn t2d, r2m
+%if %2 == 8
+    cmp  t2d, 12
+    jl x264_dequant_%2x%2_%1
+    sub  t2d, 12
+%endif
+    imul t0d, t2d, 0x2b
+    shr  t0d, 8     ; i_qbits = i_qp / 6
+    lea  t1, [t0*3]
+    sub  t2d, t1d
+    sub  t2d, t1d   ; i_mf = i_qp % 6
+    shl  t2d, %3
+%ifdef PIC64
+    lea  r1, [dequant%2_scale GLOBAL]
+    add  r1, t2
+%else
+    picgetgot r0
+    lea  r1, [t2 + dequant%2_scale GLOBAL]
+%endif
+    movifnidn r0d, r0m
+    movd m7, t0d
+%if %2 == 4
+%ifidn %1, mmx
+    DEQUANT16_FLAT [r1], 0, 16
+    DEQUANT16_FLAT [r1+8], 8, 24
+%else
+    DEQUANT16_FLAT [r1], 0, 16
+%endif
+%elifidn %1, mmx
+    DEQUANT16_FLAT [r1], 0, 8, 64, 72
+    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
+    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
+    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
+%else
+    DEQUANT16_FLAT [r1], 0, 64
+    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
+    DEQUANT16_FLAT [r1+32], 32, 96
+%endif
+    ret
+%endmacro ; DEQUANT
 
 %ifndef ARCH_X86_64
 INIT_MMX
-DEQUANT_WxH x264_dequant_4x4_mmx, 4,  4, 1
-DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6, 1
+DEQUANT mmx, 4, 4, 1
+DEQUANT mmx, 8, 6, 1
 %endif
 INIT_XMM
-DEQUANT_WxH x264_dequant_4x4_sse2, 4,  4, 2
-DEQUANT_WxH x264_dequant_8x8_sse2, 16, 6, 2
+DEQUANT sse2, 4, 4, 2
+DEQUANT sse2, 8, 6, 2
 
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 587286c..f860f6a 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -37,5 +37,9 @@ void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp
 void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
 void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
 void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
 
 #endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index f8f2e35..74b2bf9 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -648,11 +648,17 @@ static int check_quant( int cpu_ref, int cpu_new )
     for( i_cqm = 0; i_cqm < 4; i_cqm++ )
     {
         if( i_cqm == 0 )
+        {
             for( i = 0; i < 6; i++ )
                 h->pps->scaling_list[i] = x264_cqm_flat16;
+            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_FLAT;
+        }
         else if( i_cqm == 1 )
+        {
             for( i = 0; i < 6; i++ )
                 h->pps->scaling_list[i] = x264_cqm_jvt[i];
+            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_JVT;
+        }
         else
         {
             if( i_cqm == 2 )
@@ -663,6 +669,7 @@ static int check_quant( int cpu_ref, int cpu_new )
                     cqm_buf[i] = 1;
             for( i = 0; i < 6; i++ )
                 h->pps->scaling_list[i] = cqm_buf;
+            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_CUSTOM;
         }
 
         x264_cqm_init( h );