[x264-devel] commit: special case dequant for flat matrix (Loren Merritt)
git version control
git at videolan.org
Sat Mar 22 03:06:09 CET 2008
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Fri Mar 21 18:46:29 2008 -0600| [afba69a247ee3ff4ae9781cb63093529175ec135]
special case dequant for flat matrix
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=afba69a247ee3ff4ae9781cb63093529175ec135
---
common/quant.c | 10 +++
common/x86/quant-a.asm | 178 +++++++++++++++++++++++++++++++++++-------------
common/x86/quant.h | 4 +
tools/checkasm.c | 7 ++
4 files changed, 152 insertions(+), 47 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index 024dc6e..270f979 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -211,6 +211,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_8x8 = x264_quant_8x8_mmx;
pf->dequant_4x4 = x264_dequant_4x4_mmx;
pf->dequant_8x8 = x264_dequant_8x8_mmx;
+ if( h->param.i_cqm_preset == X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
+ pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
+ }
#endif
}
@@ -229,6 +234,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->dequant_4x4 = x264_dequant_4x4_sse2;
pf->dequant_8x8 = x264_dequant_8x8_sse2;
+ if( h->param.i_cqm_preset == X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
+ pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
+ }
}
#endif
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 886b340..c986826 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -25,7 +25,34 @@
SECTION_RODATA
pw_1: times 8 dw 1
-pd_10000: times 4 dd 1<<16
+pd_1: times 4 dd 1
+
+%macro DQM4 3
+ dw %1, %2, %1, %2, %2, %3, %2, %3
+%endmacro
+%macro DQM8 6
+ dw %1, %4, %5, %4, %1, %4, %5, %4
+ dw %4, %2, %6, %2, %4, %2, %6, %2
+ dw %5, %6, %3, %6, %5, %6, %3, %6
+ ; last line not used, just padding for power-of-2 stride
+ times 8 dw 0
+%endmacro
+
+dequant4_scale:
+ DQM4 10, 13, 16
+ DQM4 11, 14, 18
+ DQM4 13, 16, 20
+ DQM4 14, 18, 23
+ DQM4 16, 20, 25
+ DQM4 18, 23, 29
+
+dequant8_scale:
+ DQM8 20, 18, 32, 19, 25, 24
+ DQM8 22, 19, 35, 21, 28, 26
+ DQM8 26, 23, 42, 24, 33, 31
+ DQM8 28, 25, 45, 26, 35, 33
+ DQM8 32, 28, 51, 30, 40, 38
+ DQM8 36, 32, 58, 34, 46, 43
SECTION .text
@@ -139,11 +166,9 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m5 i_qbits
- movq m1, %2
- movq m2, %3
- movq m0, %1
- packssdw m1, m2
- pmullw m0, m1
+ movq m0, %2
+ packssdw m0, %3
+ pmullw m0, %1
psllw m0, m5
movq %1, m0
%endmacro
@@ -151,21 +176,18 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
%macro DEQUANT32_R 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; m4 f
;;; m5 -i_qbits
-;;; m6 1
+;;; m6 f
;;; m7 0
movq m0, %1
movq m1, m0
- movq m2, %2
- movq m3, %3
- punpcklwd m0, m4
- punpckhwd m1, m4
- por m2, m6 ; FIXME munge precomputed arrays?
- por m3, m6
- pmaddwd m0, m2
- pmaddwd m1, m3
+ punpcklwd m0, m7
+ punpckhwd m1, m7
+ pmaddwd m0, %2
+ pmaddwd m1, %3
+ paddd m0, m6
+ paddd m1, m6
psrad m0, m5
psrad m1, m5
packssdw m0, m1
@@ -188,60 +210,122 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
%endif
%endmacro
-;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-;-----------------------------------------------------------------------------
-%macro DEQUANT_WxH 4
-cglobal %1, 0,3
+%macro DEQUANT16_FLAT 2-8
+ movq m0, %1
+%assign i %0-2
+%rep %0-1
+%if i
+ movq m %+ i, [r0+%2]
+ pmullw m %+ i, m0
+%else
+ pmullw m0, [r0+%2]
+%endif
+ psllw m %+ i, m7
+ movq [r0+%2], m %+ i
+ %assign i i-1
+ %rotate 1
+%endrep
+%endmacro
+
%ifdef ARCH_X86_64
%define t0 r4
%define t0d r4d
- imul r4d, r2d, 0x2b
- shr r4d, 8 ; i_qbits = i_qp / 6
- lea r3, [r4*3]
- sub r2d, r3d
- sub r2d, r3d ; i_mf = i_qp % 6
- shl r2d, %3+2
- add r1, r2 ; dequant_mf[i_mf]
+ %define t1 r3
+ %define t1d r3d
+ %define t2 r2
+ %define t2d r2d
%else
%define t0 r2
%define t0d r2d
- mov r1, r2m ; i_qp
- imul r2, r1, 0x2b
- shr r2, 8 ; i_qbits = i_qp / 6
- lea r0, [r2*3]
- sub r1, r0
- sub r1, r0 ; i_mf = i_qp % 6
- shl r1, %3+2
+ %define t1 r0
+ %define t1d r0d
+ %define t2 r1
+ %define t2d r1d
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+;-----------------------------------------------------------------------------
+%macro DEQUANT 4
+cglobal x264_dequant_%2x%2_%1, 0,3
+ movifnidn t2d, r2m
+ imul t0d, t2d, 0x2b
+ shr t0d, 8 ; i_qbits = i_qp / 6
+ lea t1, [t0*3]
+ sub t2d, t1d
+ sub t2d, t1d ; i_mf = i_qp % 6
+ shl t2d, %3+2
+%ifdef ARCH_X86_64
+ add r1, t2 ; dequant_mf[i_mf]
+%else
add r1, r1m ; dequant_mf[i_mf]
mov r0, r0m ; dct
%endif
-
sub t0d, %3
jl .rshift32 ; negative qbits => rightshift
.lshift:
movd m5, t0d
- DEQUANT_LOOP DEQUANT16_L, %2, %4
+ DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
.rshift32:
neg t0d
movd m5, t0d
picgetgot t0d
- movq m4, [pw_1 GLOBAL]
- movq m6, [pd_10000 GLOBAL]
- psllw m4, m5
+ movq m6, [pd_1 GLOBAL]
pxor m7, m7
- psrlw m4, 1
- DEQUANT_LOOP DEQUANT32_R, %2, %4
-%endmacro
+ pslld m6, m5
+ psrld m6, 1
+ DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
+
+cglobal x264_dequant_%2x%2_flat16_%1, 0,3
+ movifnidn t2d, r2m
+%if %2 == 8
+ cmp t2d, 12
+ jl x264_dequant_%2x%2_%1
+ sub t2d, 12
+%endif
+ imul t0d, t2d, 0x2b
+ shr t0d, 8 ; i_qbits = i_qp / 6
+ lea t1, [t0*3]
+ sub t2d, t1d
+ sub t2d, t1d ; i_mf = i_qp % 6
+ shl t2d, %3
+%ifdef PIC64
+ lea r1, [dequant%2_scale GLOBAL]
+ add r1, t2
+%else
+ picgetgot r0
+ lea r1, [t2 + dequant%2_scale GLOBAL]
+%endif
+ movifnidn r0d, r0m
+ movd m7, t0d
+%if %2 == 4
+%ifidn %1, mmx
+ DEQUANT16_FLAT [r1], 0, 16
+ DEQUANT16_FLAT [r1+8], 8, 24
+%else
+ DEQUANT16_FLAT [r1], 0, 16
+%endif
+%elifidn %1, mmx
+ DEQUANT16_FLAT [r1], 0, 8, 64, 72
+ DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
+ DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
+ DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
+%else
+ DEQUANT16_FLAT [r1], 0, 64
+ DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
+ DEQUANT16_FLAT [r1+32], 32, 96
+%endif
+ ret
+%endmacro ; DEQUANT
%ifndef ARCH_X86_64
INIT_MMX
-DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4, 1
-DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6, 1
+DEQUANT mmx, 4, 4, 1
+DEQUANT mmx, 8, 6, 1
%endif
INIT_XMM
-DEQUANT_WxH x264_dequant_4x4_sse2, 4, 4, 2
-DEQUANT_WxH x264_dequant_8x8_sse2, 16, 6, 2
+DEQUANT sse2, 4, 4, 2
+DEQUANT sse2, 8, 6, 2
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 587286c..f860f6a 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -37,5 +37,9 @@ void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
#endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index f8f2e35..74b2bf9 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -648,11 +648,17 @@ static int check_quant( int cpu_ref, int cpu_new )
for( i_cqm = 0; i_cqm < 4; i_cqm++ )
{
if( i_cqm == 0 )
+ {
for( i = 0; i < 6; i++ )
h->pps->scaling_list[i] = x264_cqm_flat16;
+ h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_FLAT;
+ }
else if( i_cqm == 1 )
+ {
for( i = 0; i < 6; i++ )
h->pps->scaling_list[i] = x264_cqm_jvt[i];
+ h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_JVT;
+ }
else
{
if( i_cqm == 2 )
@@ -663,6 +669,7 @@ static int check_quant( int cpu_ref, int cpu_new )
cqm_buf[i] = 1;
for( i = 0; i < 6; i++ )
h->pps->scaling_list[i] = cqm_buf;
+ h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_CUSTOM;
}
x264_cqm_init( h );
More information about the x264-devel mailing list