[x264-devel] x86: AVX2 high bit-depth denoise_dct
Henrik Gramner
git at videolan.org
Mon May 20 23:06:50 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Mon May 6 18:41:24 2013 +0200| [26a6451591cd7cd25fcfeeacee3850e5dd7a7f7e] | committer: Jason Garrett-Glaser
x86: AVX2 high bit-depth denoise_dct
28->15 cycles
Also reorder instructions to use fewer registers, 3 cycles faster on Ivy Bridge with 64-bit Windows.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=26a6451591cd7cd25fcfeeacee3850e5dd7a7f7e
---
common/quant.c | 1 +
common/x86/quant-a.asm | 26 +++++++++++++-------------
2 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index d4fd405..57151bb 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -543,6 +543,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
pf->quant_8x8 = x264_quant_8x8_avx2;
pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
+ pf->denoise_dct = x264_denoise_dct_avx2;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 3f7e9b3..d77c282 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -946,31 +946,29 @@ OPTIMIZE_CHROMA_2x2_DC
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,8
- pxor m6, m6
+cglobal denoise_dct, 4,4,6
+ pxor m5, m5
movsxdifnidn r3, r3d
.loop:
mova m2, [r0+r3*4-2*mmsize]
mova m3, [r0+r3*4-1*mmsize]
ABSD m0, m2
ABSD m1, m3
- mova m4, m0
- mova m5, m1
+ paddd m4, m0, [r1+r3*4-2*mmsize]
psubd m0, [r2+r3*4-2*mmsize]
+ mova [r1+r3*4-2*mmsize], m4
+ paddd m4, m1, [r1+r3*4-1*mmsize]
psubd m1, [r2+r3*4-1*mmsize]
- pcmpgtd m7, m0, m6
- pand m0, m7
- pcmpgtd m7, m1, m6
- pand m1, m7
+ mova [r1+r3*4-1*mmsize], m4
+ pcmpgtd m4, m0, m5
+ pand m0, m4
+ pcmpgtd m4, m1, m5
+ pand m1, m4
PSIGND m0, m2
PSIGND m1, m3
mova [r0+r3*4-2*mmsize], m0
mova [r0+r3*4-1*mmsize], m1
- paddd m4, [r1+r3*4-2*mmsize]
- paddd m5, [r1+r3*4-1*mmsize]
- mova [r1+r3*4-2*mmsize], m4
- mova [r1+r3*4-1*mmsize], m5
- sub r3, mmsize/2
+ sub r3d, mmsize/2
jg .loop
RET
%endmacro
@@ -985,6 +983,8 @@ INIT_XMM ssse3
DENOISE_DCT
INIT_XMM avx
DENOISE_DCT
+INIT_YMM avx2
+DENOISE_DCT
%else ; !HIGH_BIT_DEPTH
More information about the x264-devel mailing list