[x264-devel] x86: AVX2 high bit-depth dequant
Henrik Gramner
git at videolan.org
Mon May 20 23:06:50 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue May 14 18:53:12 2013 +0200| [907573d3f7873b7600cc94d1e287d52628e11766] | committer: Jason Garrett-Glaser
x86: AVX2 high bit-depth dequant
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=907573d3f7873b7600cc94d1e287d52628e11766
---
common/quant.c | 2 ++
common/x86/quant-a.asm | 70 +++++++++++++++++++++++++++---------------------
2 files changed, 41 insertions(+), 31 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index 57151bb..5d37f07 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -543,6 +543,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
pf->quant_8x8 = x264_quant_8x8_avx2;
pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
+ pf->dequant_4x4 = x264_dequant_4x4_avx2;
+ pf->dequant_8x8 = x264_dequant_8x8_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
}
#endif // HAVE_MMX
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index d77c282..8a7f8f9 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -524,19 +524,25 @@ cglobal quant_4x4x4, 3,3,6
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
- mova m0, %2
%if HIGH_BIT_DEPTH
- pmaddwd m0, %1
- pslld m0, m2
+ mova m0, %1
+ mova m1, %4
+ pmaddwd m0, %2
+ pmaddwd m1, %3
+ pslld m0, xm2
+ pslld m1, xm2
+ mova %1, m0
+ mova %4, m1
%else
+ mova m0, %2
packssdw m0, %3
%if mmsize==32
vpermq m0, m0, q3120
%endif
pmullw m0, %1
psllw m0, xm2
-%endif
mova %1, m0
+%endif
%endmacro
%macro DEQUANT32_R 4
@@ -545,33 +551,34 @@ cglobal quant_4x4x4, 3,3,6
;;; m2 -i_qbits
;;; m3 f
;;; m4 0
-%if mmsize==32
- pmovzxwd m0, %1
- pmovzxwd m1, %4
- pmaddwd m0, %2
- pmaddwd m1, %3
- paddd m0, m3
- paddd m1, m3
+%if HIGH_BIT_DEPTH
+ mova m0, %1
+ mova m1, %4
+ pmadcswd m0, m0, %2, m3
+ pmadcswd m1, m1, %3, m3
psrad m0, xm2
psrad m1, xm2
- packssdw m0, m1
- vpermq m0, m0, q3120
+ mova %1, m0
+ mova %4, m1
%else
- mova m0, %1
-%if HIGH_BIT_DEPTH
- pmadcswd m0, m0, %2, m3
- psrad m0, m2
+%if mmsize == 32
+ pmovzxwd m0, %1
+ pmovzxwd m1, %4
%else
+ mova m0, %1
punpckhwd m1, m0, m4
punpcklwd m0, m4
+%endif
pmadcswd m0, m0, %2, m3
pmadcswd m1, m1, %3, m3
- psrad m0, m2
- psrad m1, m2
+ psrad m0, xm2
+ psrad m1, xm2
packssdw m0, m1
-%endif
+%if mmsize == 32
+ vpermq m0, m0, q3120
%endif
mova %1, m0
+%endif
%endmacro
%macro DEQUANT_LOOP 3
@@ -609,10 +616,8 @@ cglobal quant_4x4x4, 3,3,6
%endrep
%endmacro
-%if WIN64
+%if ARCH_X86_64
DECLARE_REG_TMP 6,3,2
-%elif ARCH_X86_64
- DECLARE_REG_TMP 4,3,2
%else
DECLARE_REG_TMP 2,0,1
%endif
@@ -621,8 +626,8 @@ cglobal quant_4x4x4, 3,3,6
movifnidn t2d, r2m
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
- lea t1, [t0*3]
- sub t2d, t1d
+ lea t1d, [t0*5]
+ sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if ARCH_X86_64
@@ -666,8 +671,8 @@ cglobal dequant_%1x%1_flat16, 0,3
%endif
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
- lea t1, [t0*3]
- sub t2d, t1d
+ lea t1d, [t0*5]
+ sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %2
%ifdef PIC
@@ -719,11 +724,14 @@ cglobal dequant_%1x%1_flat16, 0,3
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-DEQUANT 4, 4, 1
-DEQUANT 8, 6, 1
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
INIT_XMM xop
-DEQUANT 4, 4, 1
-DEQUANT 8, 6, 1
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
+INIT_YMM avx2
+DEQUANT 4, 4, 4
+DEQUANT 8, 6, 4
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx
More information about the x264-devel mailing list