[x264-devel] Clean up and optimize weightp, plus enable SSSE3 weight on SB/BDZ
Jason Garrett-Glaser
git at videolan.org
Sat Feb 4 21:10:52 CET 2012
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Jan 24 19:03:58 2012 -0800| [56ba096141d16ffcbabd805e2d27014f62f0d722] | committer: Jason Garrett-Glaser
Clean up and optimize weightp, plus enable SSSE3 weight on SB/BDZ
Also remove unused AVX cruft.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=56ba096141d16ffcbabd805e2d27014f62f0d722
---
common/x86/mc-a.asm | 199 +++++++++++++++++++--------------------------------
common/x86/mc-c.c | 9 +--
2 files changed, 79 insertions(+), 129 deletions(-)
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index dd3789d..3743b4d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -252,7 +252,8 @@ AVG_WEIGHT 16, 7
;=============================================================================
%if HIGH_BIT_DEPTH
-%macro WEIGHT_START 1 ; (width)
+; width
+%macro WEIGHT_START 1
mova m0, [r4+ 0] ; 1<<denom
mova m3, [r4+16]
movd m2, [r4+32] ; denom
@@ -260,7 +261,8 @@ AVG_WEIGHT 16, 7
paddw m2, [sq_1] ; denom+1
%endmacro
-%macro WEIGHT 2 ; (src1, src2)
+; src1, src2
+%macro WEIGHT 2
movh m5, [%1]
movh m6, [%2]
punpcklwd m5, m0
@@ -272,7 +274,8 @@ AVG_WEIGHT 16, 7
packssdw m5, m6
%endmacro
-%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
+; src, dst, width
+%macro WEIGHT_TWO_ROW 3
%assign x 0
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
@@ -297,109 +300,98 @@ AVG_WEIGHT 16, 7
%macro WEIGHT_START 1
mova m3, [r4]
- mova m6, [r4+16]
+ mova m4, [r4+16]
+%if notcpuflag(ssse3) || cpuflag(xop)
movd m5, [r4+32]
- pxor m2, m2
-%if (%1 == 20 || %1 == 12) && mmsize == 16
- movdq2q mm3, xmm3
- movdq2q mm4, xmm4
- movdq2q mm5, xmm5
- movdq2q mm6, xmm6
- pxor mm2, mm2
%endif
-%endmacro
-
-%macro WEIGHT_START_SSSE3 1
- mova m3, [r4]
- mova m4, [r4+16]
pxor m2, m2
-%if %1 == 20 || %1 == 12
- movdq2q mm3, xmm3
- movdq2q mm4, xmm4
- pxor mm2, mm2
-%endif
%endmacro
-;; macro to weight mmsize bytes taking half from %1 and half from %2
-%macro WEIGHT 2 ; (src1,src2)
- movh m0, [%1]
- movh m1, [%2]
- punpcklbw m0, m2 ;setup
- punpcklbw m1, m2 ;setup
- pmullw m0, m3 ;scale
- pmullw m1, m3 ;scale
- paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
- paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
- psraw m0, m5 ;denom
- psraw m1, m5 ;denom
+; src1, src2, dst1, dst2
+%macro WEIGHT_ROWx2 4
+ movh m0, [%1 ]
+ movh m1, [%1+mmsize/2]
+ movh m6, [%2 ]
+ movh m7, [%2+mmsize/2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ punpcklbw m6, m2
+ punpcklbw m7, m2
+%if cpuflag(ssse3)
+ psllw m0, 7
+ psllw m1, 7
+ psllw m6, 7
+ psllw m7, 7
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ pmulhrsw m7, m3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m6, m4
+ paddw m7, m4
+%else
+ pmullw m0, m3
+ pmullw m1, m3
+ pmullw m6, m3
+ pmullw m7, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ paddsw m6, m4
+ paddsw m7, m4
+ psraw m0, m5
+ psraw m1, m5
+ psraw m6, m5
+ psraw m7, m5
+%endif
+ packuswb m0, m1
+ packuswb m6, m7
+ mova [%3], m0
+ mova [%4], m6
%endmacro
-%macro WEIGHT_SSSE3 2
+; src1, src2, dst1, dst2, width
+%macro WEIGHT_COL 5
movh m0, [%1]
movh m1, [%2]
punpcklbw m0, m2
punpcklbw m1, m2
+%if cpuflag(ssse3)
psllw m0, 7
psllw m1, 7
pmulhrsw m0, m3
pmulhrsw m1, m3
paddw m0, m4
paddw m1, m4
-%endmacro
-
-%macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)
-%if %3 == 16
- mova [%2], %1
-%elif %3 == 8
- movq [%2], %1
-%else
- movd [%2], %1 ; width 2 can write garbage for last 2 bytes
-%endif
-%endmacro
-
-%macro WEIGHT_ROW 3 ; (src,dst,width)
- ;; load weights
- WEIGHT %1, (%1+(mmsize/2))
- packuswb m0, m1 ;put bytes into m0
- WEIGHT_SAVE_ROW m0, %2, %3
-%endmacro
-
-%macro WEIGHT_SAVE_COL 2 ;(dst,size)
-%if %2 == 8
- packuswb m0, m1
- movq [%1], m0
- movhps [%1+r1], m0
%else
- packuswb m0, m0
- packuswb m1, m1
- movd [%1], m0 ; width 2 can write garbage for last 2 bytes
- movd [%1+r1], m1
-%endif
-%endmacro
-
-%macro WEIGHT_COL 3 ; (src,dst,width)
-%if %3 <= 4 && mmsize == 16
- INIT_MMX
- ;; load weights
- WEIGHT %1, (%1+r3)
- WEIGHT_SAVE_COL %2, %3
- INIT_XMM
+ pmullw m0, m3
+ pmullw m1, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ psraw m0, m5
+ psraw m1, m5
+%endif
+%if %5 == 8
+ packuswb m0, m1
+ movh [%3], m0
+ movhps [%4], m0
%else
- WEIGHT %1, (%1+r3)
- WEIGHT_SAVE_COL %2, %3
+ packuswb m0, m0
+ packuswb m1, m1
+ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
+ movd [%4], m1
%endif
-
%endmacro
-%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
+; src, dst, width
+%macro WEIGHT_TWO_ROW 3
%assign x 0
%rep %3
%if (%3-x) >= mmsize
- WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize
- WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize
+ WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x
%assign x (x+mmsize)
%else
- WEIGHT_COL (%1+x),(%2+x),(%3-x)
+ WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x
%exitrep
%endif
%if x >= %3
@@ -414,34 +406,15 @@ AVG_WEIGHT 16, 7
;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
-%if ARCH_X86_64
-%define NUMREGS 6
-%define LOAD_HEIGHT
-%define HEIGHT_REG r5d
-%define TMP_REG r6d
-%else
-%define NUMREGS 5
-%define TMP_REG r5d
-%define LOAD_HEIGHT mov r4d, r5m
-%define HEIGHT_REG r4d
-%endif
-
-%assign XMMREGS 7
-%if HIGH_BIT_DEPTH
-%assign NUMREGS NUMREGS+1
-%assign XMMREGS 8
-%endif
-
%macro WEIGHTER 1
- cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
+cglobal mc_weight_w%1, 6,6,8
FIX_STRIDES r1, r3
WEIGHT_START %1
- LOAD_HEIGHT
.loop:
WEIGHT_TWO_ROW r2, r0, %1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub HEIGHT_REG, 2
+ sub r5d, 2
jg .loop
REP_RET
%endmacro
@@ -458,24 +431,13 @@ WEIGHTER 16
WEIGHTER 20
%if HIGH_BIT_DEPTH
WEIGHTER 12
-INIT_XMM avx
-WEIGHTER 8
-WEIGHTER 12
-WEIGHTER 16
-WEIGHTER 20
%else
-%define WEIGHT WEIGHT_SSSE3
-%define WEIGHT_START WEIGHT_START_SSSE3
INIT_MMX ssse3
WEIGHTER 4
INIT_XMM ssse3
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
-INIT_XMM avx
-WEIGHTER 8
-WEIGHTER 16
-WEIGHTER 20
%endif
%macro OFFSET_OP 7
@@ -520,7 +482,7 @@ WEIGHTER 20
;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 2
- cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
+cglobal mc_offset%2_w%1, 6,6
FIX_STRIDES r1, r3
mova m2, [r4]
%if HIGH_BIT_DEPTH
@@ -528,12 +490,11 @@ WEIGHTER 20
mova m3, [pw_pixel_max]
%endif
%endif
- LOAD_HEIGHT
.loop:
OFFSET_TWO_ROW r2, r0, %1, %2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub HEIGHT_REG, 2
+ sub r5d, 2
jg .loop
REP_RET
%endmacro
@@ -552,20 +513,10 @@ INIT_XMM sse2
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
-INIT_XMM avx
-OFFSETPN 12
-OFFSETPN 16
-OFFSETPN 20
%if HIGH_BIT_DEPTH
INIT_XMM sse2
OFFSETPN 8
-INIT_XMM avx
-OFFSETPN 8
%endif
-%undef LOAD_HEIGHT
-%undef HEIGHT_REG
-%undef NUMREGS
-
;=============================================================================
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 5238556..e0680f5 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -72,11 +72,6 @@ MC_WEIGHT( 8, ssse3 )
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
-MC_WEIGHT( 4, avx )
-MC_WEIGHT( 8, avx )
-MC_WEIGHT( 12, avx )
-MC_WEIGHT( 16, avx )
-MC_WEIGHT( 20, avx )
#undef MC_OFFSET
#undef MC_WEIGHT
@@ -745,6 +740,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
pf->integral_init8h = x264_integral_init8h_avx;
pf->hpel_filter = x264_hpel_filter_avx;
+
+ /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
+ pf->weight_cache = x264_weight_cache_ssse3;
+ pf->weight = x264_mc_weight_wtab_ssse3;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
More information about the x264-devel mailing list