[x264-devel] x86: Faster pixel_ssd_nv12

Henrik Gramner git at videolan.org
Tue Sep 20 20:57:52 CEST 2016


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Sep 17 14:45:08 2016 +0200 | [9521b278adb92081f052c1b7bfc4b95651d88b07] | committer: Anton Mitrofanov

x86: Faster pixel_ssd_nv12

Also drop the MMX2 version to simplify things.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9521b278adb92081f052c1b7bfc4b95651d88b07
---

 common/pixel.c         |   2 -
 common/x86/pixel-a.asm | 109 ++++++++++++++++++++-----------------------------
 common/x86/pixel.h     |   3 --
 3 files changed, 45 insertions(+), 69 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index 3963af7..4db4b46 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -885,7 +885,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT8( ssd, _mmx2 );
         INIT_ADS( _mmx2 );
 
-        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
 #if ARCH_X86
@@ -1071,7 +1070,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
-        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_mmx2;
 #if ARCH_X86
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index cf60618..cff22ee 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -43,6 +43,9 @@ mask_ff:   times 16 db 0xff
 mask_ac4:  times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
 mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
 mask_ac8:  times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
+%if HIGH_BIT_DEPTH
+ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
+%endif
 %if BIT_DEPTH == 10
 ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
 ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
@@ -531,8 +534,8 @@ SSD 16,  8
 ;
 ;   2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
 ;
-; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
-; distortion levels it will take much more than that though.
+; For 10-bit XMM this means width >= 32832. At sane distortion levels
+; it will take much more than that though.
 ;-----------------------------------------------------------------------------
 %if HIGH_BIT_DEPTH
 %macro SSD_NV12 0
@@ -541,13 +544,14 @@ cglobal pixel_ssd_nv12_core, 6,7,7
     FIX_STRIDES r1, r3
     add         r0, r4
     add         r2, r4
-    xor         r6, r6
+    neg         r4
     pxor        m4, m4
     pxor        m5, m5
-    pxor        m6, m6
+%if mmsize == 32
+    vbroadcasti128 m6, [ssd_nv12_shuf]
+%endif
 .loopy:
     mov         r6, r4
-    neg         r6
     pxor        m2, m2
     pxor        m3, m3
 .loopx:
@@ -555,11 +559,11 @@ cglobal pixel_ssd_nv12_core, 6,7,7
     mova        m1, [r0+r6+mmsize]
     psubw       m0, [r2+r6]
     psubw       m1, [r2+r6+mmsize]
-    PSHUFLW     m0, m0, q3120
-    PSHUFLW     m1, m1, q3120
-%if mmsize >= 16
-    pshufhw     m0, m0, q3120
-    pshufhw     m1, m1, q3120
+%if mmsize == 32
+    pshufb      m0, m6
+    pshufb      m1, m6
+%else
+    SBUTTERFLY wd, 0, 1, 6
 %endif
 %if cpuflag(xop)
     pmadcswd    m2, m0, m0, m2
@@ -577,59 +581,30 @@ cglobal pixel_ssd_nv12_core, 6,7,7
     psubd       m3, m1
 .no_overread:
 %endif
-%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
-                 ; equation above, putting the width limit at 8208
-    punpckhdq   m0, m2, m6
-    punpckhdq   m1, m3, m6
-    punpckldq   m2, m6
-    punpckldq   m3, m6
-    paddq       m3, m2
-    paddq       m1, m0
-    paddq       m4, m3
-    paddq       m4, m1
-%else ; unfortunately paddq is sse2
-      ; emulate 48 bit precision for mmx2 instead
-    mova        m0, m2
-    mova        m1, m3
-    punpcklwd   m2, m6
-    punpcklwd   m3, m6
-    punpckhwd   m0, m6
-    punpckhwd   m1, m6
-    paddd       m3, m2
-    paddd       m1, m0
-    paddd       m4, m3
-    paddd       m5, m1
-%endif
+    punpckhdq   m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the
+    punpckhdq   m1, m3, m5 ; equation above, putting the width limit at 8208
+    punpckldq   m2, m5
+    punpckldq   m3, m5
+    paddq       m0, m1
+    paddq       m2, m3
+    paddq       m4, m0
+    paddq       m4, m2
     add         r0, r1
     add         r2, r3
     dec        r5d
     jg .loopy
-    mov         r3, r6m
-    mov         r4, r7m
+    mov         r0, r6m
+    mov         r1, r7m
 %if mmsize == 32
     vextracti128 xm0, m4, 1
     paddq      xm4, xm0
 %endif
-%if mmsize >= 16
-    movq      [r3], xm4
-    movhps    [r4], xm4
-%else ; fixup for mmx2
-    SBUTTERFLY dq, 4, 5, 0
-    mova        m0, m4
-    psrld       m4, 16
-    paddd       m5, m4
-    pslld       m0, 16
-    SBUTTERFLY dq, 0, 5, 4
-    psrlq       m0, 16
-    psrlq       m5, 16
-    movq      [r3], m0
-    movq      [r4], m5
-%endif
+    movq      [r0], xm4
+    movhps    [r1], xm4
     RET
 %endmacro ; SSD_NV12
-%endif ; HIGH_BIT_DEPTH
 
-%if HIGH_BIT_DEPTH == 0
+%else ; !HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
 ; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
 ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
@@ -643,12 +618,12 @@ cglobal pixel_ssd_nv12_core, 6,7
     add    r4d, r4d
     add     r0, r4
     add     r2, r4
+    neg     r4
     pxor    m3, m3
     pxor    m4, m4
     mova    m5, [pw_00ff]
 .loopy:
     mov     r6, r4
-    neg     r6
 .loopx:
 %if mmsize == 32 ; only 16-byte alignment is guaranteed
     movu    m2, [r0+r6]
@@ -686,21 +661,27 @@ cglobal pixel_ssd_nv12_core, 6,7
     add     r2, r3
     dec    r5d
     jg .loopy
-    mov     r3, r6m
-    mov     r4, r7m
-    HADDD   m3, m0
-    HADDD   m4, m0
-    pxor   xm0, xm0
-    punpckldq xm3, xm0
-    punpckldq xm4, xm0
-    movq  [r3], xm3
-    movq  [r4], xm4
+    mov     r0, r6m
+    mov     r1, r7m
+%if cpuflag(ssse3)
+    phaddd  m3, m4
+%else
+    SBUTTERFLY qdq, 3, 4, 0
+    paddd   m3, m4
+%endif
+%if mmsize == 32
+    vextracti128 xm4, m3, 1
+    paddd  xm3, xm4
+%endif
+    psllq  xm4, xm3, 32
+    paddd  xm3, xm4
+    psrlq  xm3, 32
+    movq  [r0], xm3
+    movhps [r1], xm3
     RET
 %endmacro ; SSD_NV12
 %endif ; !HIGH_BIT_DEPTH
 
-INIT_MMX mmx2
-SSD_NV12
 INIT_XMM sse2
 SSD_NV12
 INIT_XMM avx
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 7f57788..a8ed389 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -145,9 +145,6 @@ int x264_intra_sad_x9_8x8_sse4  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, u
 int x264_intra_sad_x9_8x8_avx   ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
 int x264_intra_sad_x9_8x8_avx2  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
 
-void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
-                                    pixel *pixuv2, intptr_t stride2, int width,
-                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
 void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );



More information about the x264-devel mailing list