[x264-devel] commit: SSSE3 version of zigzag_8x8_field (Jason Garrett-Glaser )
git version control
git at videolan.org
Sat Dec 5 11:01:52 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Dec 3 15:36:52 2009 -0800| [77d0631c35b7e6928759eda0b3d7d229b18f27c9] | committer: Jason Garrett-Glaser
SSSE3 version of zigzag_8x8_field
Slightly faster interlaced encoding with 8x8dct.
Helps most on Nehalem, somewhat disappointing on Conroe/Penryn.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=77d0631c35b7e6928759eda0b3d7d229b18f27c9
---
common/dct.c | 1 +
common/x86/dct-a.asm | 88 ++++++++++++++++++++++++++++++++++++++++++++++++-
common/x86/dct.h | 1 +
3 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 245347b..725ebfb 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -725,6 +725,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
+ pf->scan_8x8 = x264_zigzag_scan_8x8_field_ssse3;
}
#endif
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index c4ebae5..c56bcdc 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -26,17 +26,35 @@
%include "x86inc.asm"
%include "x86util.asm"
+%macro SHUFFLE_16BIT 8 ; takes 8 word indices, emits the matching 16-byte pshufb mask (2 bytes per index, args rotated each pass)
+ %rep 8
+ db %1*2 ; byte offset of the selected word's low byte; index -1 gives 0xFE (high bit set, so pshufb writes 0)
+ db %1*2+1 ; byte offset of the selected word's high byte; index -1 gives 0xFF (also zeroing)
+ %rotate 1
+ %endrep
+%endmacro
+
SECTION_RODATA
pw_32_0: times 4 dw 32
times 4 dw 0
pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000
hsub_mul: times 8 db 1, -1
+
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
-pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
-pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
+pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
+pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
+
+pb_scan8fielda: SHUFFLE_16BIT 0,1,2,-1,-1,3,4,-1
+pb_scan8fieldb: SHUFFLE_16BIT -1,-1,-1,3,4,-1,-1,5
+pb_scan8fieldc: SHUFFLE_16BIT -1,6,0,1,2,7,-1,-1
+pb_scan8fieldd: SHUFFLE_16BIT 5,0,1,2,6,-1,-1,-1
+pb_scan8fielde: SHUFFLE_16BIT 6,0,1,2,3,7,-1,-1
+pb_scan8fieldf: SHUFFLE_16BIT 5,6,0,1,2,3,7,-1
+pb_scan8fieldg: SHUFFLE_16BIT 4,5,0,1,2,3,6,7
+
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 16 db 1
@@ -835,6 +853,72 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
RET
;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_field_ssse3( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+; r0 = level (output), r1 = dct (input); the movdqa accesses below require both to be 16-byte aligned.
+; Output order (the entry at position i is the dct index that ends up in level[i]):
+; 0 1 2 8 9 3 4 10
+; 16 11 5 6 7 12 17 24
+; 18 13 14 15 19 25 32 26
+; 20 21 22 23 27 33 40 34
+; 28 29 30 31 35 41 48 42
+; 36 37 38 39 43 49 50 44
+; 45 46 47 51 56 57 52 53
+; 54 55 58 59 60 61 62 63
+
+cglobal x264_zigzag_scan_8x8_field_ssse3, 2,4,8
+ movdqa xmm0, [r1+ 0] ; 0 1 2 3 4 5 6 7
+ movdqu xmm1, [r1+10] ; 5 6 7 8 9 10 11 12
+ movdqu xmm3, [r1+26] ; 13 14 15 16 17 18 19 20
+ movdqu xmm4, [r1+40] ; 20 21 22 23 24 25 26 27
+ movdqu xmm5, [r1+56] ; 28 29 30 31 32 33 34 35
+ movdqu xmm6, [r1+72] ; 36 37 38 39 40 41 42 43
+ movdqa xmm2, xmm1 ; second copy: coefficients 5..12 feed both output row 0 and row 1
+ movdqa xmm7, [pb_scan8fielde GLOBAL] ; one mask reused by the three shuffles of xmm4/xmm5/xmm6
+ pshufb xmm0, [pb_scan8fielda GLOBAL] ; 0 1 2 _ _ 3 4 _
+ pshufb xmm1, [pb_scan8fieldb GLOBAL] ; _ _ _ 8 9 _ _ 10
+ por xmm0, xmm1 ; 0 1 2 8 9 3 4 10 (zeroed lanes make the OR a merge)
+ pshufb xmm2, [pb_scan8fieldc GLOBAL] ; _ 11 5 6 7 12 _ _
+ pshufb xmm3, [pb_scan8fieldd GLOBAL] ; 18 13 14 15 19 _ _ _
+ pshufb xmm4, xmm7 ; 26 20 21 22 23 27 _ _
+ pshufb xmm5, xmm7 ; 34 28 29 30 31 35 _ _
+ pshufb xmm6, xmm7 ; 42 36 37 38 39 43 _ _
+ movdqa [r0+ 0], xmm0 ; level[0..7]
+ movdqa [r0+16], xmm2 ; level[8..15]; slots 8, 14, 15 are zero here and patched below
+ movdqa [r0+32], xmm3 ; level[16..23]; slots 21, 22 patched below, slot 23 overwritten by the next store
+ movdqu [r0+46], xmm4 ; level[23..30]; slots 29, 30 patched below
+ movdqu [r0+62], xmm5 ; level[31..38]; slots 37, 38 patched below
+ movdqu [r0+78], xmm6 ; level[39..46]; slots 45, 46 overwritten by the store to r0+90
+ movdqu xmm0, [r1+88] ; 44 45 46 47 48 49 50 51
+ movdqu xmm1, [r1+104] ; 52 53 54 55 56 57 58 59
+ movq xmm2, [r1+120] ; 60 61 62 63
+ pshufb xmm0, [pb_scan8fieldf GLOBAL] ; 49 50 44 45 46 47 51 _
+ pshufb xmm1, [pb_scan8fieldg GLOBAL] ; 56 57 52 53 54 55 58 59
+ movdqu [r0+90], xmm0 ; level[45..52]; slot 52 overwritten by the next store
+ movdqu [r0+104], xmm1 ; level[52..59]
+ movq [r0+120], xmm2 ; level[60..63], copied in place
+; Word-sized moves fill the nine slots the vector stores above left as zero; they must stay after those stores.
+ mov r2w, [r1+32] ; dct 16
+ mov r3w, [r1+34] ; dct 17
+ mov [r0+16], r2w ; level[8] = 16
+ mov [r0+28], r3w ; level[14] = 17
+ mov r2w, [r1+48] ; dct 24
+ mov r3w, [r1+50] ; dct 25
+ mov [r0+30], r2w ; level[15] = 24
+ mov [r0+42], r3w ; level[21] = 25
+ mov r2w, [r1+64] ; dct 32
+ mov r3w, [r1+66] ; dct 33
+ mov [r0+44], r2w ; level[22] = 32
+ mov [r0+58], r3w ; level[29] = 33
+ mov r2w, [r1+80] ; dct 40
+ mov r3w, [r1+82] ; dct 41
+ mov r1w, [r1+96] ; dct 48; last read through r1, so its low word can be reused as scratch
+ mov [r0+60], r2w ; level[30] = 40
+ mov [r0+74], r3w ; level[37] = 41
+ mov [r0+76], r1w ; level[38] = 48
+ RET
+
+;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
%macro ZIGZAG_SUB_4x4 2
diff --git a/common/x86/dct.h b/common/x86/dct.h
index e537d62..6ff20ad 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -69,6 +69,7 @@ void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_field_ssse3 ( int16_t level[64], int16_t dct[64] );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
More information about the x264-devel
mailing list