[x264-devel] x86: faster SSSE3 hpel

Jason Garrett-Glaser git at videolan.org
Mon May 20 23:06:47 CEST 2013


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Apr 30 17:36:46 2013 -0700| [a9ed051f2bc73c9bfeff006d7328bd2bc99ce147] | committer: Jason Garrett-Glaser

x86: faster SSSE3 hpel

~7% faster using the pmulhrsw trick from mc_chroma.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a9ed051f2bc73c9bfeff006d7328bd2bc99ce147
---

 common/x86/const-a.asm |    1 +
 common/x86/mc-a.asm    |    2 +-
 common/x86/mc-a2.asm   |   97 +++++++++++++++++++++++++++++-------------------
 3 files changed, 61 insertions(+), 39 deletions(-)
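
For context, the trick being borrowed: pmulhrsw computes a signed
((a*b + 0x4000) >> 15) in each 16-bit lane, so multiplying by 1024
(pw_1024) is exactly the old "add pw_16, arithmetic shift right 5", and
multiplying by 512 (pw_512) is exactly the old "add pw_32, shift right
6". That identity is what lets the new FILT_PACK below collapse the
paddw+psraw pair into a single instruction on SSSE3. A minimal scalar C
model (not part of the patch; the helper name is just illustrative)
checking the identity:

    #include <assert.h>
    #include <stdint.h>

    /* One lane of SSSE3 pmulhrsw: multiply, round, keep the high half. */
    static int16_t pmulhrsw(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
    }

    int main(void)
    {
        /* Negative lanes behave the same way (both shifts are arithmetic);
         * only the non-negative range is checked here for C portability. */
        for (int32_t x = 0; x <= 32767; x++) {
            assert(pmulhrsw((int16_t)x, 1024) == (x + 16) >> 5); /* pw_1024 */
            assert(pmulhrsw((int16_t)x,  512) == (x + 32) >> 6); /* pw_512  */
        }
        return 0;
    }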

diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index 3a7a942..e8428d8 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -33,6 +33,7 @@ const hsub_mul,    times 16 db 1, -1
 const pw_1,        times 16 dw 1
 const pw_16,       times 16 dw 16
 const pw_32,       times 16 dw 32
+const pw_512,      times 16 dw 512
 const pw_00ff,     times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
 const pd_1,        times 8 dd 1
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 477fa21..7acc366 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -34,7 +34,6 @@
 
 SECTION_RODATA 32
 
-pw_512: times 16 dw 512
 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
 ch_shuf_adj: times 8 db 0
              times 8 db 2
@@ -50,6 +49,7 @@ cextern pw_4
 cextern pw_8
 cextern pw_32
 cextern pw_64
+cextern pw_512
 cextern pw_00ff
 cextern pw_pixel_max
 cextern sw_64
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index f1f09d1..8c3bd46 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -44,6 +44,7 @@ deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif
+pw_1024: times 16 dw 1024
 
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
@@ -64,6 +65,7 @@ cextern pb_0
 cextern pw_1
 cextern pw_16
 cextern pw_32
+cextern pw_512
 cextern pw_00ff
 cextern pw_3fff
 cextern pw_pixel_max
@@ -127,19 +129,24 @@ cextern pd_ffff
     paddw  %4, %6
 %endmacro
 
-%macro FILT_PACK 4-6 b
-    paddw      %1, %4
-    paddw      %2, %4
-%if %0 == 6
-    psubusw    %1, %6
-    psubusw    %2, %6
-    psrlw      %1, %3
-    psrlw      %2, %3
+%macro FILT_PACK 3-5
+%if cpuflag(ssse3)
+    pmulhrsw %1, %3
+    pmulhrsw %2, %3
+%else
+    paddw    %1, %3
+    paddw    %2, %3
+%if %0 == 5
+    psubusw  %1, %5
+    psubusw  %2, %5
+    psrlw    %1, %4
+    psrlw    %2, %4
 %else
-    psraw      %1, %3
-    psraw      %2, %3
+    psraw    %1, %4
+    psraw    %2, %4
 %endif
-%ifnidn w, %5
+%endif
+%if HIGH_BIT_DEPTH == 0
     packuswb %1, %2
 %endif
 %endmacro
@@ -203,7 +210,7 @@ cglobal hpel_filter_v, 5,6,11
     mova      [r2+r4+mmsize], m4
     paddw      m1, s30
     paddw      m4, s30
-    FILT_PACK  m1, m4, 5, m6, w, s10
+    FILT_PACK  m1, m4, m6, 5, s10
     CLIPW      m1, m0, m7
     CLIPW      m4, m0, m7
     mova      [r0+r4], m1
@@ -295,7 +302,7 @@ cglobal hpel_filter_h, 3,4,8
     FILT_H2    m1, m2, m3, m4, m5, m6
     mova       m7, [pw_1]
     pxor       m2, m2
-    FILT_PACK  m1, m4, 1, m7, w
+    FILT_PACK  m1, m4, m7, 1
     CLIPW      m1, m2, m0
     CLIPW      m4, m2, m0
     mova      [r0+r2], m1
@@ -349,14 +356,15 @@ cglobal hpel_filter_v, 5,6,%1
     paddw  m4, m5
     paddw  m1, m3
     paddw  m4, m6
+    mova   m7, [pw_1024]
 %else
     LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
     LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
     LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
     LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7   ; c1
     FILT_V2 m1, m2, m3, m4, m5, m6
+    mova   m7, [pw_16]
 %endif
-    mova      m7, [pw_16]
 %if mmsize==32
     mova         [r2+r4*2], xm1
     mova         [r2+r4*2+mmsize/2], xm4
@@ -366,7 +374,7 @@ cglobal hpel_filter_v, 5,6,%1
     mova      [r2+r4*2], m1
     mova      [r2+r4*2+mmsize], m4
 %endif
-    FILT_PACK m1, m4, 5, m7
+    FILT_PACK m1, m4, m7, 5
     movnta    [r0+r4], m1
     add r1, mmsize
     add r5, mmsize
@@ -378,8 +386,8 @@ cglobal hpel_filter_v, 5,6,%1
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
 ;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal hpel_filter_c_mmx2, 3,3
+INIT_MMX mmx2
+cglobal hpel_filter_c, 3,3
     add r0, r2
     lea r1, [r1+r2*2]
     neg r2
@@ -399,7 +407,7 @@ cglobal hpel_filter_c_mmx2, 3,3
     paddw  m5, [src+12] ; b1
     paddw  m6, [src+10] ; c1
     FILT_H2 m1, m2, m3, m4, m5, m6
-    FILT_PACK m1, m4, 6, m7
+    FILT_PACK m1, m4, m7, 6
     movntq [r0+r2], m1
     add r2, 8
     jl .loop
@@ -408,7 +416,8 @@ cglobal hpel_filter_c_mmx2, 3,3
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
 ;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_mmx2, 3,3
+INIT_MMX mmx2
+cglobal hpel_filter_h, 3,3
     add r0, r2
     add r1, r2
     neg r2
@@ -443,14 +452,12 @@ cglobal hpel_filter_h_mmx2, 3,3
     paddw      m6, m7 ; a1
     movq       m7, [pw_1]
     FILT_H2 m1, m2, m3, m4, m5, m6
-    FILT_PACK m1, m4, 1, m7
+    FILT_PACK m1, m4, m7, 1
     movntq     [r0+r2], m1
     add r2, 8
     jl .loop
     RET
 
-INIT_XMM
-
 %macro HPEL_C 0
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
@@ -461,13 +468,17 @@ cglobal hpel_filter_c, 3,3,9
     neg r2
     %define src r1+r2*2
 %ifnidn cpuname, sse2
+%if cpuflag(ssse3)
+    mova    m7, [pw_512]
+%else
     mova    m7, [pw_32]
-    %define tpw_32 m7
+%endif
+    %define pw_rnd m7
 %elif ARCH_X86_64
     mova    m8, [pw_32]
-    %define tpw_32 m8
+    %define pw_rnd m8
 %else
-    %define tpw_32 [pw_32]
+    %define pw_rnd [pw_32]
 %endif
 ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
 %if cpuflag(misalign) || mmsize==32
@@ -513,7 +524,7 @@ cglobal hpel_filter_c, 3,3,9
     paddw     m6, m0
     FILT_H    m3, m5, m6
 %endif
-    FILT_PACK m4, m3, 6, tpw_32
+    FILT_PACK m4, m3, pw_rnd, 6
 %if mmsize==32
     vpermq    m4, m4, q3120
 %endif
@@ -526,7 +537,8 @@ cglobal hpel_filter_c, 3,3,9
 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
 ;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_sse2, 3,3,8
+INIT_XMM sse2
+cglobal hpel_filter_h, 3,3,8
     add r0, r2
     add r1, r2
     neg r2
@@ -565,7 +577,7 @@ cglobal hpel_filter_h_sse2, 3,3,8
     paddw      m6, m7 ; c1
     mova       m7, [pw_1] ; FIXME xmm8
     FILT_H2 m1, m2, m3, m4, m5, m6
-    FILT_PACK m1, m4, 1, m7
+    FILT_PACK m1, m4, m7, 1
     movntps    [r0+r2], m1
     add r2, 16
     jl .loop
@@ -582,7 +594,7 @@ cglobal hpel_filter_h, 3,3
     %define src r1+r2
     mova      m0, [src-16]
     mova      m1, [src]
-    mova      m7, [pw_16]
+    mova      m7, [pw_1024]
 .loop:
     mova      m2, [src+16]
     ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
@@ -604,7 +616,7 @@ cglobal hpel_filter_h, 3,3
     paddw     m3, m1
     paddw     m4, m5
     paddw     m4, m6
-    FILT_PACK m3, m4, 5, m7
+    FILT_PACK m3, m4, m7, 5
     pshufb    m3, [hpel_shuf]
     mova      m1, m2
     movntps [r0+r2], m3
@@ -663,8 +675,8 @@ cglobal hpel_filter_h, 3,3,8
     paddw     m1, m3
     paddw     m1, m4
 
-    mova      m2, [pw_16]
-    FILT_PACK m0, m1, 5, m2
+    mova      m2, [pw_1024]
+    FILT_PACK m0, m1, m2, 5
     pshufb    m0, [hpel_shuf]
     movnta [r0+r2], m0
     add       r2, mmsize
@@ -715,7 +727,7 @@ cglobal hpel_filter_h, 3,3,8
     add       r1, 16
     mova      %1, m1
     mova      %2, m4
-    FILT_PACK m1, m4, 5, m15
+    FILT_PACK m1, m4, m15, 5
     movntps  [r8+r4+%5], m1
 %endmacro
 
@@ -735,7 +747,7 @@ cglobal hpel_filter_h, 3,3,8
 %macro DO_FILT_C 4
     FILT_C %1, %2, %3, 6
     FILT_C %2, %1, %4, 6
-    FILT_PACK %3, %4, 6, m15
+    FILT_PACK %3, %4, m15, 6
     movntps   [r5+r4], %3
 %endmacro
 
@@ -766,14 +778,14 @@ cglobal hpel_filter_h, 3,3,8
     paddw     m2, m4
     paddw     m1, m5
     paddw     m2, m6
-    FILT_PACK m1, m2, 5, m15
+    FILT_PACK m1, m2, m15, 5
     pshufb    m1, [hpel_shuf]
 %else ; ssse3, avx
     ADD8TO16  m1, m6, m12, m3, m0 ; a
     ADD8TO16  m2, m5, m12, m3, m0 ; b
     ADD8TO16  %2, m4, m12, m3, m0 ; c
     FILT_V2   m1, m2, %2, m6, m5, m4
-    FILT_PACK m1, m6, 5, m15
+    FILT_PACK m1, m6, m15, 5
 %endif
     movntps [r0+r4], m1
     mova      %2, %3
@@ -800,13 +812,14 @@ cglobal hpel_filter, 7,9,16
     sub       r3, r2
     sub       r3, r2
     mov       r4, r7
-    mova     m15, [pw_16]
 %if cpuflag(ssse3)
     mova      m0, [filt_mul51]
     mova     m12, [filt_mul15]
     mova     m14, [filt_mul20]
+    mova     m15, [pw_1024]
 %else
     pxor      m0, m0
+    mova     m15, [pw_16]
 %endif
 ;ALIGN 16
 .loopy:
@@ -816,9 +829,17 @@ cglobal hpel_filter, 7,9,16
 .loopx:
     DO_FILT_V m6, m5, m11, m12, 16
 .lastx:
+%if cpuflag(ssse3)
+    psrlw   m15, 1   ; pw_512
+%else
     paddw   m15, m15 ; pw_32
+%endif
     DO_FILT_C m9, m8, m7, m6
-    psrlw   m15, 1 ; pw_16
+%if cpuflag(ssse3)
+    paddw   m15, m15 ; pw_1024
+%else
+    psrlw   m15, 1   ; pw_16
+%endif
     movdqa   m7, m5
     DO_FILT_H m10, m13, m11
     add      r4, 16
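
A note on the last hunk: the v/h filters round with a 5-bit shift and the
c filter with a 6-bit shift, and the loop keeps both constants in the one
register m15 by re-deriving it in place rather than reloading from memory.
As an additive bias (pre-SSSE3) the >>6 constant is double the >>5 one,
but as a pmulhrsw multiplier it is half of it, which is why the
paddw/psrlw pair swaps direction between the two paths. A scalar sketch
of that bookkeeping (m15/bias are stand-ins for the register, not code
from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* SSSE3: m15 is a pmulhrsw multiplier, loaded once as pw_1024. */
        uint16_t m15 = 1024;
        m15 >>= 1;  assert(m15 ==  512); /* psrlw m15,1   -> pw_512, DO_FILT_C  */
        m15 += m15; assert(m15 == 1024); /* paddw m15,m15 -> pw_1024, DO_FILT_H */

        /* Pre-SSSE3: m15 is an additive bias, so the directions flip. */
        uint16_t bias = 16;                  /* pw_16, loaded once     */
        bias += bias; assert(bias == 32);    /* pw_32 for DO_FILT_C    */
        bias >>= 1;   assert(bias == 16);    /* pw_16 for DO_FILT_H    */
        return 0;
    }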


