[x264-devel] Clean up and optimize weightp, plus enable SSSE3 weight on SB/BDZ

Jason Garrett-Glaser git at videolan.org
Sat Feb 4 21:10:52 CET 2012


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Jan 24 19:03:58 2012 -0800| [56ba096141d16ffcbabd805e2d27014f62f0d722] | committer: Jason Garrett-Glaser

Clean up and optimize weightp, plus enable SSSE3 weight on SB/BDZ
Also remove unused AVX cruft.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=56ba096141d16ffcbabd805e2d27014f62f0d722
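
For context on the patch below: both the SSE2/MMX and the SSSE3 paths evaluate the
weighted-prediction expression visible in the asm comments,
((src*scale + (1<<(denom-1)) + (offset<<denom)) >> denom), with packuswb providing the
final clamp to the pixel range. A minimal scalar sketch of the equivalence the SSSE3
path relies on, assuming denom >= 1 and assuming the weight cache stores
scale << (8 - denom) as the pmulhrsw multiplier and the plain offset as the addend
(that setup happens outside this patch); the helper names are illustrative only:

#include <stdint.h>

/* Scalar model of the SSE2/MMX path: pmullw by scale, paddsw with the cached
 * constant 1<<(denom-1) + (offset<<denom), psraw by denom; packuswb clamps. */
static inline uint8_t weight_pixel_sse2( uint8_t src, int scale, int denom, int offset )
{
    int v = (src*scale + (1 << (denom-1)) + (offset << denom)) >> denom;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar model of the SSSE3 path: psllw 7, pmulhrsw, paddw offset.
 * pmulhrsw(a,b) = (a*b + 0x4000) >> 15, so with b = scale << (8 - denom) the
 * product reduces to (src*scale + (1<<(denom-1))) >> denom: same rounding. */
static inline uint8_t weight_pixel_ssse3( uint8_t src, int scale, int denom, int offset )
{
    int a = src << 7;               /* psllw m0, 7 */
    int b = scale << (8 - denom);   /* assumed value of the cached multiplier */
    int v = ((a*b + 0x4000) >> 15) + offset;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

Adding the offset before the shift (as offset<<denom) or after it yields the same
result, because offset<<denom is an exact multiple of 1<<denom; this is why the SSSE3
path can defer the offset to a plain paddw after pmulhrsw.
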
---

 common/x86/mc-a.asm |  199 +++++++++++++++++++--------------------------------
 common/x86/mc-c.c   |    9 +--
 2 files changed, 79 insertions(+), 129 deletions(-)

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index dd3789d..3743b4d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -252,7 +252,8 @@ AVG_WEIGHT 16, 7
 ;=============================================================================
 
 %if HIGH_BIT_DEPTH
-%macro WEIGHT_START 1 ; (width)
+; width
+%macro WEIGHT_START 1
     mova        m0, [r4+ 0]         ; 1<<denom
     mova        m3, [r4+16]
     movd        m2, [r4+32]         ; denom
@@ -260,7 +261,8 @@ AVG_WEIGHT 16, 7
     paddw       m2, [sq_1]          ; denom+1
 %endmacro
 
-%macro WEIGHT 2 ; (src1, src2)
+; src1, src2
+%macro WEIGHT 2
     movh        m5, [%1]
     movh        m6, [%2]
     punpcklwd   m5, m0
@@ -272,7 +274,8 @@ AVG_WEIGHT 16, 7
     packssdw    m5, m6
 %endmacro
 
-%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
+; src, dst, width
+%macro WEIGHT_TWO_ROW 3
     %assign x 0
 %rep (%3+mmsize/2-1)/(mmsize/2)
 %if %3-x/2 <= 4 && mmsize == 16
@@ -297,109 +300,98 @@ AVG_WEIGHT 16, 7
 
 %macro WEIGHT_START 1
     mova     m3, [r4]
-    mova     m6, [r4+16]
+    mova     m4, [r4+16]
+%if notcpuflag(ssse3) || cpuflag(xop)
     movd     m5, [r4+32]
-    pxor     m2, m2
-%if (%1 == 20 || %1 == 12) && mmsize == 16
-    movdq2q mm3, xmm3
-    movdq2q mm4, xmm4
-    movdq2q mm5, xmm5
-    movdq2q mm6, xmm6
-    pxor    mm2, mm2
 %endif
-%endmacro
-
-%macro WEIGHT_START_SSSE3 1
-    mova     m3, [r4]
-    mova     m4, [r4+16]
     pxor     m2, m2
-%if %1 == 20 || %1 == 12
-    movdq2q mm3, xmm3
-    movdq2q mm4, xmm4
-    pxor    mm2, mm2
-%endif
 %endmacro
 
-;; macro to weight mmsize bytes taking half from %1 and half from %2
-%macro WEIGHT 2             ; (src1,src2)
-    movh      m0, [%1]
-    movh      m1, [%2]
-    punpcklbw m0, m2        ;setup
-    punpcklbw m1, m2        ;setup
-    pmullw    m0, m3        ;scale
-    pmullw    m1, m3        ;scale
-    paddsw    m0, m6        ;1<<(denom-1)+(offset<<denom)
-    paddsw    m1, m6        ;1<<(denom-1)+(offset<<denom)
-    psraw     m0, m5        ;denom
-    psraw     m1, m5        ;denom
+; src1, src2, dst1, dst2
+%macro WEIGHT_ROWx2 4
+    movh      m0, [%1         ]
+    movh      m1, [%1+mmsize/2]
+    movh      m6, [%2         ]
+    movh      m7, [%2+mmsize/2]
+    punpcklbw m0, m2
+    punpcklbw m1, m2
+    punpcklbw m6, m2
+    punpcklbw m7, m2
+%if cpuflag(ssse3)
+    psllw     m0, 7
+    psllw     m1, 7
+    psllw     m6, 7
+    psllw     m7, 7
+    pmulhrsw  m0, m3
+    pmulhrsw  m1, m3
+    pmulhrsw  m6, m3
+    pmulhrsw  m7, m3
+    paddw     m0, m4
+    paddw     m1, m4
+    paddw     m6, m4
+    paddw     m7, m4
+%else
+    pmullw    m0, m3
+    pmullw    m1, m3
+    pmullw    m6, m3
+    pmullw    m7, m3
+    paddsw    m0, m4        ;1<<(denom-1)+(offset<<denom)
+    paddsw    m1, m4
+    paddsw    m6, m4
+    paddsw    m7, m4
+    psraw     m0, m5
+    psraw     m1, m5
+    psraw     m6, m5
+    psraw     m7, m5
+%endif
+    packuswb  m0, m1
+    packuswb  m6, m7
+    mova    [%3], m0
+    mova    [%4], m6
 %endmacro
 
-%macro WEIGHT_SSSE3 2
+; src1, src2, dst1, dst2, width
+%macro WEIGHT_COL 5
     movh      m0, [%1]
     movh      m1, [%2]
     punpcklbw m0, m2
     punpcklbw m1, m2
+%if cpuflag(ssse3)
     psllw     m0, 7
     psllw     m1, 7
     pmulhrsw  m0, m3
     pmulhrsw  m1, m3
     paddw     m0, m4
     paddw     m1, m4
-%endmacro
-
-%macro WEIGHT_SAVE_ROW 3        ;(src,dst,width)
-%if %3 == 16
-    mova     [%2], %1
-%elif %3 == 8
-    movq     [%2], %1
-%else
-    movd     [%2], %1       ; width 2 can write garbage for last 2 bytes
-%endif
-%endmacro
-
-%macro WEIGHT_ROW 3         ; (src,dst,width)
-    ;; load weights
-    WEIGHT           %1, (%1+(mmsize/2))
-    packuswb         m0, m1        ;put bytes into m0
-    WEIGHT_SAVE_ROW  m0, %2, %3
-%endmacro
-
-%macro WEIGHT_SAVE_COL 2        ;(dst,size)
-%if %2 == 8
-    packuswb     m0, m1
-    movq       [%1], m0
-    movhps  [%1+r1], m0
 %else
-    packuswb     m0, m0
-    packuswb     m1, m1
-    movd       [%1], m0    ; width 2 can write garbage for last 2 bytes
-    movd    [%1+r1], m1
-%endif
-%endmacro
-
-%macro WEIGHT_COL 3     ; (src,dst,width)
-%if %3 <= 4 && mmsize == 16
-    INIT_MMX
-    ;; load weights
-    WEIGHT           %1, (%1+r3)
-    WEIGHT_SAVE_COL  %2, %3
-    INIT_XMM
+    pmullw    m0, m3
+    pmullw    m1, m3
+    paddsw    m0, m4        ;1<<(denom-1)+(offset<<denom)
+    paddsw    m1, m4
+    psraw     m0, m5
+    psraw     m1, m5
+%endif
+%if %5 == 8
+    packuswb  m0, m1
+    movh    [%3], m0
+    movhps  [%4], m0
 %else
-    WEIGHT           %1, (%1+r3)
-    WEIGHT_SAVE_COL  %2, %3
+    packuswb  m0, m0
+    packuswb  m1, m1
+    movd    [%3], m0    ; width 2 can write garbage for the last 2 bytes
+    movd    [%4], m1
 %endif
-
 %endmacro
 
-%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
+; src, dst, width
+%macro WEIGHT_TWO_ROW 3
 %assign x 0
 %rep %3
 %if (%3-x) >= mmsize
-    WEIGHT_ROW    (%1+x),    (%2+x), mmsize     ; weight 1 mmsize
-    WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize     ; weight 1 mmsize
+    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x
     %assign x (x+mmsize)
 %else
-    WEIGHT_COL (%1+x),(%2+x),(%3-x)
+    WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x
     %exitrep
 %endif
 %if x >= %3
@@ -414,34 +406,15 @@ AVG_WEIGHT 16, 7
 ;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
 ;-----------------------------------------------------------------------------
 
-%if ARCH_X86_64
-%define NUMREGS 6
-%define LOAD_HEIGHT
-%define HEIGHT_REG r5d
-%define TMP_REG r6d
-%else
-%define NUMREGS 5
-%define TMP_REG r5d
-%define LOAD_HEIGHT mov r4d, r5m
-%define HEIGHT_REG r4d
-%endif
-
-%assign XMMREGS 7
-%if HIGH_BIT_DEPTH
-%assign NUMREGS NUMREGS+1
-%assign XMMREGS 8
-%endif
-
 %macro WEIGHTER 1
-    cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
+cglobal mc_weight_w%1, 6,6,8
     FIX_STRIDES r1, r3
     WEIGHT_START %1
-    LOAD_HEIGHT
 .loop:
     WEIGHT_TWO_ROW r2, r0, %1
     lea  r0, [r0+r1*2]
     lea  r2, [r2+r3*2]
-    sub HEIGHT_REG, 2
+    sub r5d, 2
     jg .loop
     REP_RET
 %endmacro
@@ -458,24 +431,13 @@ WEIGHTER 16
 WEIGHTER 20
 %if HIGH_BIT_DEPTH
 WEIGHTER 12
-INIT_XMM avx
-WEIGHTER  8
-WEIGHTER 12
-WEIGHTER 16
-WEIGHTER 20
 %else
-%define WEIGHT WEIGHT_SSSE3
-%define WEIGHT_START WEIGHT_START_SSSE3
 INIT_MMX ssse3
 WEIGHTER  4
 INIT_XMM ssse3
 WEIGHTER  8
 WEIGHTER 16
 WEIGHTER 20
-INIT_XMM avx
-WEIGHTER  8
-WEIGHTER 16
-WEIGHTER 20
 %endif
 
 %macro OFFSET_OP 7
@@ -520,7 +482,7 @@ WEIGHTER 20
 ;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
 ;-----------------------------------------------------------------------------
 %macro OFFSET 2
-    cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
+cglobal mc_offset%2_w%1, 6,6
     FIX_STRIDES r1, r3
     mova m2, [r4]
 %if HIGH_BIT_DEPTH
@@ -528,12 +490,11 @@ WEIGHTER 20
     mova m3, [pw_pixel_max]
 %endif
 %endif
-    LOAD_HEIGHT
 .loop:
     OFFSET_TWO_ROW r2, r0, %1, %2
     lea  r0, [r0+r1*2]
     lea  r2, [r2+r3*2]
-    sub HEIGHT_REG, 2
+    sub r5d, 2
     jg .loop
     REP_RET
 %endmacro
@@ -552,20 +513,10 @@ INIT_XMM sse2
 OFFSETPN 12
 OFFSETPN 16
 OFFSETPN 20
-INIT_XMM avx
-OFFSETPN 12
-OFFSETPN 16
-OFFSETPN 20
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
 OFFSETPN  8
-INIT_XMM avx
-OFFSETPN  8
 %endif
-%undef LOAD_HEIGHT
-%undef HEIGHT_REG
-%undef NUMREGS
-
 
 
 ;=============================================================================
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 5238556..e0680f5 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -72,11 +72,6 @@ MC_WEIGHT( 8, ssse3 )
 MC_WEIGHT( 12, ssse3 )
 MC_WEIGHT( 16, ssse3 )
 MC_WEIGHT( 20, ssse3 )
-MC_WEIGHT( 4, avx )
-MC_WEIGHT( 8, avx )
-MC_WEIGHT( 12, avx )
-MC_WEIGHT( 16, avx )
-MC_WEIGHT( 20, avx )
 #undef MC_OFFSET
 #undef MC_WEIGHT
 
@@ -745,6 +740,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
     pf->integral_init8h = x264_integral_init8h_avx;
     pf->hpel_filter = x264_hpel_filter_avx;
+
+    /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
+    pf->weight_cache = x264_weight_cache_ssse3;
+    pf->weight = x264_mc_weight_wtab_ssse3;
     if( !(cpu&X264_CPU_STACK_MOD4) )
         pf->mc_chroma = x264_mc_chroma_avx;
 
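The mc-c.c hunk keeps the SSSE3 weight table selected even when AVX is available, per
the Sandy Bridge/Bulldozer comment, while the AVX-labelled weight/offset kernels are
dropped entirely. For reference, a hypothetical scalar counterpart of the mc_weight_wX
kernels, following the prototype given in the asm comment and the two-rows-per-iteration
loop of the WEIGHTER macro; the width is baked into each real kernel's name but is a
parameter here, and weight_ref_t stands in for the i_scale/i_denom/i_offset fields of
x264's weight_t (8-bit pixels, denom >= 1 assumed):

#include <stdint.h>

typedef struct { int i_scale, i_denom, i_offset; } weight_ref_t; /* assumed field subset */

static inline uint8_t clamp_u8( int v ) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Processes two rows per iteration and decrements h by 2, mirroring the
 * lea/sub/jg loop in the WEIGHTER macro. */
static void mc_weight_ref( uint8_t *dst, int i_dst_stride,
                           uint8_t *src, int i_src_stride,
                           const weight_ref_t *w, int width, int h )
{
    int round = (1 << (w->i_denom-1)) + (w->i_offset << w->i_denom);
    for( ; h > 0; h -= 2 )
    {
        for( int y = 0; y < 2; y++ )
            for( int x = 0; x < width; x++ )
                dst[y*i_dst_stride + x] =
                    clamp_u8( (src[y*i_src_stride + x] * w->i_scale + round) >> w->i_denom );
        dst += 2*i_dst_stride;
        src += 2*i_src_stride;
    }
}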


