[x264-devel] commit: faster dequant (Loren Merritt)
git version control
git at videolan.org
Fri Mar 21 08:56:15 CET 2008
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Fri Mar 21 00:04:46 2008 -0600| [02e610262bac2645742cfaa40d018fd43f26e859]
faster dequant
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=02e610262bac2645742cfaa40d018fd43f26e859
---
common/quant.c | 4 +-
common/x86/deblock-a.asm | 28 ----------
common/x86/quant-a.asm | 127 ++++++++++++++++++++++++----------------------
common/x86/quant.h | 2 +
common/x86/x86inc.asm | 28 ++++++++++
5 files changed, 100 insertions(+), 89 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index ed1148c..024dc6e 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -209,9 +209,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#ifdef ARCH_X86
pf->quant_4x4 = x264_quant_4x4_mmx;
pf->quant_8x8 = x264_quant_8x8_mmx;
-#endif
pf->dequant_4x4 = x264_dequant_4x4_mmx;
pf->dequant_8x8 = x264_dequant_8x8_mmx;
+#endif
}
if( cpu&X264_CPU_MMXEXT )
@@ -227,6 +227,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
pf->quant_4x4 = x264_quant_4x4_sse2;
pf->quant_8x8 = x264_quant_8x8_sse2;
+ pf->dequant_4x4 = x264_dequant_4x4_sse2;
+ pf->dequant_8x8 = x264_dequant_8x8_sse2;
}
#endif
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 8071678..fd87234 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -29,34 +29,6 @@ pb_a1: times 16 db 0xa1
SECTION .text
-%macro INIT_MMX 0
- %undef movq
- %define m0 mm0
- %define m1 mm1
- %define m2 mm2
- %define m3 mm3
- %define m4 mm4
- %define m5 mm5
- %define m6 mm6
- %define m7 mm7
- %undef m8
- %undef m9
-%endmacro
-
-%macro INIT_XMM 0
- %define movq movdqa
- %define m0 xmm0
- %define m1 xmm1
- %define m2 xmm2
- %define m3 xmm3
- %define m4 xmm4
- %define m5 xmm5
- %define m6 xmm6
- %define m7 xmm7
- %define m8 xmm8
- %define m9 xmm9
-%endmacro
-
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 11e78ea..886b340 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -24,7 +24,8 @@
%include "x86inc.asm"
SECTION_RODATA
-pd_1: times 2 dd 1
+pw_1: times 8 dw 1
+pd_10000: times 4 dd 1<<16
SECTION .text
@@ -133,76 +134,74 @@ QUANT_AC x264_quant_8x8_ssse3, SSSE3_QUANT_1x8, 8, 16
; dequant
;=============================================================================
-%macro DEQUANT16_L_1x4 3
+%macro DEQUANT16_L 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; mm5 i_qbits
-
- movq mm1, %2
- movq mm2, %3
- movq mm0, %1
- packssdw mm1, mm2
- pmullw mm0, mm1
- psllw mm0, mm5
- movq %1, mm0
+;;; m5 i_qbits
+
+ movq m1, %2
+ movq m2, %3
+ movq m0, %1
+ packssdw m1, m2
+ pmullw m0, m1
+ psllw m0, m5
+ movq %1, m0
%endmacro
-%macro DEQUANT32_R_1x4 3
+%macro DEQUANT32_R 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; mm5 -i_qbits
-;;; mm6 f as dwords
-;;; mm7 0
-
- movq mm0, %1
- movq mm1, mm0
- punpcklwd mm0, mm0
- punpckhwd mm1, mm1
-
- movq mm2, mm0
- movq mm3, mm1
- pmulhw mm0, %2
- pmulhw mm1, %3
- pmullw mm2, %2
- pmullw mm3, %3
- pslld mm0, 16
- pslld mm1, 16
- paddd mm0, mm2
- paddd mm1, mm3
-
- paddd mm0, mm6
- paddd mm1, mm6
- psrad mm0, mm5
- psrad mm1, mm5
-
- packssdw mm0, mm1
- movq %1, mm0
+;;; m4 f
+;;; m5 -i_qbits
+;;; m6 1
+;;; m7 0
+
+ movq m0, %1
+ movq m1, m0
+ movq m2, %2
+ movq m3, %3
+ punpcklwd m0, m4
+ punpckhwd m1, m4
+ por m2, m6 ; FIXME munge precomputed arrays?
+ por m3, m6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ psrad m0, m5
+ psrad m1, m5
+ packssdw m0, m1
+ movq %1, m0
%endmacro
-%macro DEQUANT_LOOP 2
- mov t0d, 8*(%2-2)
+%macro DEQUANT_LOOP 3
+%if 8*(%2-2*%3)
+ mov t0d, 8*(%2-2*%3)
%%loop:
- %1 [r0+t0+8], [r1+t0*2+16], [r1+t0*2+24]
- %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8]
- sub t0d, 16
- jge %%loop
+ %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
+ %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ sub t0d, 16*%3
+ jge %%loop
rep ret
+%else
+ %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
+ %1 [r0 ], [r1 ], [r1+ 8*%3]
+ ret
+%endif
%endmacro
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
-%macro DEQUANT_WxH 3
+%macro DEQUANT_WxH 4
cglobal %1, 0,3
%ifdef ARCH_X86_64
%define t0 r4
%define t0d r4d
imul r4d, r2d, 0x2b
shr r4d, 8 ; i_qbits = i_qp / 6
- lea r3d, [r4d*3]
- sub r2, r3
- sub r2, r3 ; i_mf = i_qp % 6
- shl r2, %3+2
+ lea r3, [r4*3]
+ sub r2d, r3d
+ sub r2d, r3d ; i_mf = i_qp % 6
+ shl r2d, %3+2
add r1, r2 ; dequant_mf[i_mf]
%else
%define t0 r2
@@ -222,19 +221,27 @@ cglobal %1, 0,3
jl .rshift32 ; negative qbits => rightshift
.lshift:
- movd mm5, t0d
- DEQUANT_LOOP DEQUANT16_L_1x4, %2
+ movd m5, t0d
+ DEQUANT_LOOP DEQUANT16_L, %2, %4
.rshift32:
neg t0d
- movd mm5, t0d
+ movd m5, t0d
picgetgot t0d
- movq mm6, [pd_1 GLOBAL]
- pxor mm7, mm7
- pslld mm6, mm5
- psrld mm6, 1
- DEQUANT_LOOP DEQUANT32_R_1x4, %2
+ movq m4, [pw_1 GLOBAL]
+ movq m6, [pd_10000 GLOBAL]
+ psllw m4, m5
+ pxor m7, m7
+ psrlw m4, 1
+ DEQUANT_LOOP DEQUANT32_R, %2, %4
%endmacro
-DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
-DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6
+%ifndef ARCH_X86_64
+INIT_MMX
+DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4, 1
+DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6, 1
+%endif
+INIT_XMM
+DEQUANT_WxH x264_dequant_4x4_sse2, 4, 4, 2
+DEQUANT_WxH x264_dequant_8x8_sse2, 16, 6, 2
+
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 8532fde..587286c 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -35,5 +35,7 @@ void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16]
void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
#endif
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index b8fbe13..256aa2e 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -328,3 +328,31 @@ SECTION ".note.GNU-stack" noalloc noexec nowrite progbits
%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
+%macro INIT_MMX 0
+ %undef movq
+ %define m0 mm0
+ %define m1 mm1
+ %define m2 mm2
+ %define m3 mm3
+ %define m4 mm4
+ %define m5 mm5
+ %define m6 mm6
+ %define m7 mm7
+ %undef m8
+ %undef m9
+%endmacro
+
+%macro INIT_XMM 0
+ %define movq movdqa
+ %define m0 xmm0
+ %define m1 xmm1
+ %define m2 xmm2
+ %define m3 xmm3
+ %define m4 xmm4
+ %define m5 xmm5
+ %define m6 xmm6
+ %define m7 xmm7
+ %define m8 xmm8
+ %define m9 xmm9
+%endmacro
+
More information about the x264-devel mailing list