[x264-devel] commit: MMX version of 8x8 interlaced zigzag (Cleo Saulnier)

git version control git at videolan.org
Wed Dec 9 13:57:27 CET 2009


x264 | branch: master | Cleo Saulnier <cleosaulnier at yahoo.com> | Mon Dec  7 12:40:14 2009 -0800| [77fc51070b3c916b3b16888e8ead0f3cb3e09eaa] | committer: Jason Garrett-Glaser 

MMX version of 8x8 interlaced zigzag
Just as fast as SSSE3 on Nehalem (and faster on Conroe/Penryn), so remove the SSSE3 version.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=77fc51070b3c916b3b16888e8ead0f3cb3e09eaa
---

 common/dct.c         |    4 +-
 common/x86/dct-a.asm |  126 ++++++++++++++++++++++++++++++--------------------
 common/x86/dct.h     |    2 +-
 3 files changed, 80 insertions(+), 52 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 725ebfb..aa83ef4 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -720,12 +720,14 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 #ifdef HAVE_MMX
         if( cpu&X264_CPU_MMXEXT )
+        {
             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
+            pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
+        }
         if( cpu&X264_CPU_SSSE3 )
         {
             pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
-            pf->scan_8x8 = x264_zigzag_scan_8x8_field_ssse3;
         }
 #endif
 
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index c56bcdc..d1fe271 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -866,56 +866,82 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
 ; 45 46 47 51 56 57 52 53
 ; 54 55 58 59 60 61 62 63
 
-cglobal x264_zigzag_scan_8x8_field_ssse3, 2,4,8
-    movdqa xmm0, [r1+ 0]                   ;  0  1  2  3  4  5  6  7
-    movdqu xmm1, [r1+10]                   ;  5  6  7  8  9 10 11 12
-    movdqu xmm3, [r1+26]                   ; 13 14 15 16 17 18 19 20
-    movdqu xmm4, [r1+40]                   ; 20 21 22 23 24 25 26 27
-    movdqu xmm5, [r1+56]                   ; 28 29 30 31 32 33 34 35
-    movdqu xmm6, [r1+72]                   ; 36 37 38 39 40 41 42 43
-    movdqa xmm2, xmm1
-    movdqa xmm7, [pb_scan8fielde GLOBAL]
-    pshufb xmm0, [pb_scan8fielda GLOBAL]   ;  0  1  2  _  _  3  4  _
-    pshufb xmm1, [pb_scan8fieldb GLOBAL]   ;  _  _  _  8  9  _  _ 10
-    por    xmm0, xmm1
-    pshufb xmm2, [pb_scan8fieldc GLOBAL]   ;  _ 11  5  6  7 12  _  _
-    pshufb xmm3, [pb_scan8fieldd GLOBAL]   ; 18 13 14 15 19  _  _  _
-    pshufb xmm4, xmm7                      ; 26 20 21 22 23 27  _  _
-    pshufb xmm5, xmm7                      ; 34 28 29 30 31 35  _  _
-    pshufb xmm6, xmm7                      ; 42 36 37 38 39 43  _  _
-    movdqa [r0+ 0], xmm0
-    movdqa [r0+16], xmm2
-    movdqa [r0+32], xmm3
-    movdqu [r0+46], xmm4
-    movdqu [r0+62], xmm5
-    movdqu [r0+78], xmm6
-    movdqu xmm0, [r1+88]                   ; 44 45 46 47 48 49 50 51
-    movdqu xmm1, [r1+104]                  ; 52 53 54 55 56 57 58 59
-    movq   xmm2, [r1+120]                  ; 60 61 62 63
-    pshufb xmm0, [pb_scan8fieldf GLOBAL]   ; 49 50 44 45 46 47 51 _
-    pshufb xmm1, [pb_scan8fieldg GLOBAL]   ; 56 57 52 53 54 55 58 59
-    movdqu [r0+90], xmm0
-    movdqu [r0+104], xmm1
-    movq   [r0+120], xmm2
-
-    mov     r2w, [r1+32]
-    mov     r3w, [r1+34]
-    mov [r0+16], r2w
-    mov [r0+28], r3w
-    mov     r2w, [r1+48]
-    mov     r3w, [r1+50]
-    mov [r0+30], r2w
-    mov [r0+42], r3w
-    mov     r2w, [r1+64]
-    mov     r3w, [r1+66]
-    mov [r0+44], r2w
-    mov [r0+58], r3w
-    mov     r2w, [r1+80]
-    mov     r3w, [r1+82]
-    mov     r1w, [r1+96]
-    mov [r0+60], r2w
-    mov [r0+74], r3w
-    mov [r0+76], r1w
+cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3 ; 8x8 field zigzag: loads all 64 words at r1, stores them reordered at r0; r2 = scratch
+    movq       mm0, [r1+2*0]        ; 03 02 01 00  (source word indices, highest lane first)
+    movq       mm1, [r1+2*4]        ; 07 06 05 04
+    movq       mm2, [r1+2*8]        ; 11 10 09 08
+    pshufw     mm3, mm0, 011111111b ; 03 03 03 03
+    movd        r2, mm2             ; 09 08
+    pshufw     mm2, mm2, 000111001b ; 08 11 10 09
+    punpcklwd  mm3, mm1             ; 05 03 04 03
+    pinsrw     mm0, r2, 3           ; 08 02 01 00
+    movq       mm4, mm2             ; 08 11 10 09  (copy; word 11 is picked up below)
+    punpcklwd  mm2, mm3             ; 04 10 03 09
+    pshufw     mm2, mm2, 010110100b ; 10 04 03 09
+    movq  [r0+2*0], mm0             ; 08 02 01 00
+    movq  [r0+2*4], mm2             ; 10 04 03 09
+    movq       mm3, [r1+2*12]       ; 15 14 13 12
+    movq       mm5, [r1+2*16]       ; 19 18 17 16
+    punpckldq  mm6, mm5             ; 17 16 XX XX  (XX = stale mm6 contents, dropped by the next unpack)
+    psrlq      mm1, 16              ; XX 07 06 05
+    punpckhwd  mm6, mm4             ; 08 17 11 16
+    punpckldq  mm6, mm1             ; 06 05 11 16
+    movq  [r0+2*8], mm6             ; 06 05 11 16
+    psrlq      mm1, 16              ; XX XX 07 06
+    punpcklwd  mm1, mm5             ; 17 07 16 06
+    movq       mm0, [r1+2*20]       ; 23 22 21 20
+    movq       mm2, [r1+2*24]       ; 27 26 25 24
+    movq       mm6, mm3
+    punpckhdq  mm1, mm1             ; 17 07 17 07
+    punpcklwd  mm6, mm2             ; 25 13 24 12
+    pextrw      r2, mm5, 2          ; 18
+    movq [r0+2*24], mm0             ; 23 22 21 20
+    punpcklwd  mm1, mm6             ; 24 17 12 07
+    movq [r0+2*12], mm1             ; 24 17 12 07
+    pinsrw     mm3, r2, 0           ; 15 14 13 18
+    movq [r0+2*16], mm3             ; 15 14 13 18
+    movq       mm7, [r1+2*28]       ; 31 30 29 28
+    movq       mm0, [r1+2*32]       ; 35 34 33 32
+    psrlq      mm5, 48              ; XX XX XX 19
+    pshufw     mm1, mm2, 011111001b ; 27 27 26 25
+    punpcklwd  mm5, mm0             ; 33 XX 32 19
+    psrlq      mm2, 48              ; XX XX XX 27
+    punpcklwd  mm5, mm1             ; 26 32 25 19
+    movq [r0+2*32], mm7             ; 31 30 29 28
+    movq [r0+2*20], mm5             ; 26 32 25 19
+    movq       mm7, [r1+2*36]       ; 39 38 37 36
+    movq       mm1, [r1+2*40]       ; 43 42 41 40
+    pshufw     mm3, mm0, 011111001b ; 35 35 34 33
+    punpcklwd  mm2, mm1             ; 41 XX 40 27
+    movq [r0+2*40], mm7             ; 39 38 37 36
+    punpcklwd  mm2, mm3             ; 34 40 33 27
+    movq [r0+2*28], mm2             ; 34 40 33 27
+    movq       mm7, [r1+2*44]       ; 47 46 45 44
+    movq       mm2, [r1+2*48]       ; 51 50 49 48
+    psrlq      mm0, 48              ; XX XX XX 35
+    punpcklwd  mm0, mm2             ; 49 XX 48 35
+    pshufw     mm3, mm1, 011111001b ; 43 43 42 41
+    punpcklwd  mm0, mm3             ; 42 48 41 35
+    movq [r0+2*36], mm0             ; 42 48 41 35
+    pextrw      r2, mm2, 3          ; 51
+    psrlq      mm1, 48              ; XX XX XX 43
+    punpcklwd  mm1, mm7             ; 45 XX 44 43
+    psrlq      mm2, 16              ; XX 51 50 49
+    punpcklwd  mm1, mm2             ; 50 44 49 43
+    pshufw     mm1, mm1, 010110100b ; 44 50 49 43
+    movq [r0+2*44], mm1             ; 44 50 49 43
+    psrlq      mm7, 16              ; XX 47 46 45
+    pinsrw     mm7, r2, 3           ; 51 47 46 45
+    movq [r0+2*48], mm7             ; 51 47 46 45
+    movq       mm0, [r1+2*56]       ; 59 58 57 56
+    movq       mm1, [r1+2*52]       ; 55 54 53 52
+    movq       mm2, mm0             ; 59 58 57 56
+    movq       mm7, [r1+2*60]       ; 63 62 61 60
+    punpckldq  mm2, mm1             ; 53 52 57 56
+    punpckhdq  mm1, mm0             ; 59 58 55 54
+    movq [r0+2*52], mm2             ; 53 52 57 56
+    movq [r0+2*56], mm1             ; 59 58 55 54
+    movq [r0+2*60], mm7             ; 63 62 61 60
     RET
 
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 6ff20ad..a8f46ca 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -69,7 +69,7 @@ void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
 void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_frame_mmx   ( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_8x8_field_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
 int  x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int  x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
 int  x264_zigzag_sub_4x4_field_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );



More information about the x264-devel mailing list