[x264-devel] x86: AVX2 add16x16_idct_dc
Jason Garrett-Glaser
git at videolan.org
Mon May 20 23:06:49 CEST 2013
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Wed May 1 14:32:11 2013 -0700| [327386f70836507cb44266e5d71bd1d744fe3d78] | committer: Jason Garrett-Glaser
x86: AVX2 add16x16_idct_dc
27 -> 19 cycles
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=327386f70836507cb44266e5d71bd1d744fe3d78
---
common/dct.c | 1 +
common/x86/dct-a.asm | 50 +++++++++++++++++++++++++++++++++++++++++++-------
common/x86/dct.h | 1 +
3 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index e0219ec..52ef9be 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -697,6 +697,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct = x264_add16x16_idct_avx2;
dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
#if ARCH_X86_64
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
#endif
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 52b287f..a3e2ce6 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -30,7 +30,9 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+pb_idctdc_unpack: times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
+pb_idctdc_unpack2: times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
@@ -39,8 +41,6 @@ pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
-pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
-pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14
pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14
@@ -74,6 +74,7 @@ SECTION .text
cextern pw_32_0
cextern pw_32
+cextern pw_512
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
@@ -738,8 +739,7 @@ cglobal add8x8_idct_dc, 2,2
movh m0, [r1]
pxor m1, m1
add r0, FDEC_STRIDE*4
- paddw m0, [pw_32]
- psraw m0, 6
+ pmulhrsw m0, [pw_512]
psubw m1, m0
mova m5, [pb_idctdc_unpack]
packuswb m0, m0
@@ -836,8 +836,7 @@ cglobal add16x16_idct_dc, 2,2,8
mova m0, [r1]
add r1, 16
pxor m1, m1
- paddw m0, [pw_32]
- psraw m0, 6
+ pmulhrsw m0, [pw_512]
psubw m1, m0
mova m5, [ pb_idctdc_unpack]
mova m6, [pb_idctdc_unpack2]
@@ -857,6 +856,43 @@ ADD16x16
INIT_XMM avx
ADD16x16
+%macro ADD_DC_AVX2 3 ; %1 = bytes to add, %2 = bytes to subtract, %3 = byte offset applied to both r0 and r2; overwrites m4 and m5
+ mova xm4, [r0+FDEC_STRIDE*0+%3] ; low lane of m4: 16 bytes at r0 + %3
+ mova xm5, [r0+FDEC_STRIDE*1+%3] ; low lane of m5: the row one FDEC_STRIDE further on
+ vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1 ; high lane of m4: the matching row addressed from r2
+ vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1 ; high lane of m5: the matching row addressed from r2
+ paddusb m4, %1 ; unsigned-saturating byte add of %1
+ paddusb m5, %1 ; both rows of the pair receive the same %1
+ psubusb m4, %2 ; unsigned-saturating byte subtract of %2
+ psubusb m5, %2 ; both rows of the pair receive the same %2
+ mova [r0+FDEC_STRIDE*0+%3], xm4 ; write the low lanes back to the r0-based rows they were loaded from
+ mova [r0+FDEC_STRIDE*1+%3], xm5 ; second r0-based row
+ vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1 ; write the high lanes back to the r2-based rows
+ vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1 ; second r2-based row
+%endmacro
+
+INIT_YMM avx2
+cglobal add16x16_idct_dc, 2,3,6 ; r0 = p_dst, r1 = dct[16] (per the C prototype); r2 is scratch; uses m0-m5
+ add r0, FDEC_STRIDE*4 ; bias r0 so the row offsets used below run from -4 to +3 strides
+ mova m0, [r1] ; 32-byte load: all 16 DC words, dct[0..7] in the low lane, dct[8..15] in the high lane
+ pxor m1, m1 ; m1 = 0, about to become the negated DCs
+ pmulhrsw m0, [pw_512] ; rounding scale; stands in for the paddw [pw_32] / psraw 6 pair this patch removes from the XMM versions
+ psubw m1, m0 ; m1 = -m0
+ mova m4, [pb_idctdc_unpack] ; shuffle mask: source bytes 0..3 of each lane, each repeated 4 times
+ mova m5, [pb_idctdc_unpack2] ; shuffle mask: source bytes 4..7 of each lane, each repeated 4 times
+ packuswb m0, m0 ; words -> unsigned-saturated bytes, so negative DCs become 0 (amount to add)
+ packuswb m1, m1 ; same for the negated DCs, so positive DCs become 0 (amount to subtract)
+ pshufb m2, m0, m4 ; add amounts: dct[0..3] in the low lane (block row 0), dct[8..11] in the high lane (block row 2)
+ pshufb m3, m1, m4 ; subtract amounts for block rows 0 and 2
+ pshufb m0, m5 ; add amounts: dct[4..7] in the low lane (block row 1), dct[12..15] in the high lane (block row 3)
+ pshufb m1, m5 ; subtract amounts for block rows 1 and 3
+ lea r2, [r0+FDEC_STRIDE*8] ; r2 = 8 rows below r0; addresses the rows handled by the high lanes
+ ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4 ; pixel rows 0-1 (via r0) and 8-9 (via r2)
+ ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2 ; pixel rows 2-3 and 10-11
+ ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0 ; pixel rows 4-5 and 12-13
+ ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2 ; pixel rows 6-7 and 14-15
+ RET
+
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 1595f5b..f5595b0 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -68,6 +68,7 @@ void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] );
void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] );
+void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_dct4x4dc_mmx ( int16_t d[16] );
void x264_dct4x4dc_sse2 ( int32_t d[16] );
More information about the x264-devel
mailing list