[x264-devel] commit: MMX/SSE2 high bit depth weight_cache/offset(sub|add) functions (Daniel Kang)
git at videolan.org
git at videolan.org
Wed Dec 15 04:19:35 CET 2010
x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Mon Dec 13 17:15:12 2010 -0500 | [b5e236636a26cf40568ccc44fefcc6aef6d0f0ea] | committer: Jason Garrett-Glaser
MMX/SSE2 high bit depth weight_cache/offset(sub|add) functions
Patch from Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b5e236636a26cf40568ccc44fefcc6aef6d0f0ea
---
common/x86/mc-a.asm | 44 +++++++++++++++++++++++++++++---------------
common/x86/mc-c.c | 45 ++++++++++++++++++++++++++++++++++++++-------
2 files changed, 67 insertions(+), 22 deletions(-)
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 22fb872..3ef4385 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -188,18 +188,10 @@ AVG_WEIGHT ssse3, 16, 7
%ifdef HIGH_BIT_DEPTH
%macro WEIGHT_START 1 ; (width)
+ mova m0, [r4+ 0] ; 1<<denom
+ mova m3, [r4+16]
movd m2, [r4+32] ; denom
- movd m3, [r4+36] ; scale
- mov TMP_REG, [r4+40] ; offset
- mova m0, [pw_1]
- shl TMP_REG, BIT_DEPTH-7
mova m4, [pw_pixel_max]
- add TMP_REG, 1
- psllw m0, m2 ; 1<<denom
- movd m1, TMP_REG ; 1+(offset<<(BIT_DEPTH-8+1))
- psllw m3, 1 ; scale<<1
- punpcklwd m3, m1
- SPLATD m3, m3
paddw m2, [sq_1] ; denom+1
%endmacro
@@ -354,7 +346,7 @@ AVG_WEIGHT ssse3, 16, 7
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
+;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
@@ -415,8 +407,17 @@ WEIGHTER 20, ssse3
%macro OFFSET_OP 7
mov%6 m0, [%1]
mov%6 m1, [%2]
+%ifdef HIGH_BIT_DEPTH
+ p%5usw m0, m2
+ p%5usw m1, m2
+%ifidn %5,add
+ pminsw m0, m3
+ pminsw m1, m3
+%endif
+%else
p%5usb m0, m2
p%5usb m1, m2
+%endif
mov%7 [%3], m0
mov%7 [%4], m1
%endmacro
@@ -424,25 +425,35 @@ WEIGHTER 20, ssse3
%macro OFFSET_TWO_ROW 4
%assign x 0
%rep %3
-%if (%3-x) >= mmsize
+%if (%3*SIZEOF_PIXEL-x) >= mmsize
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
%assign x (x+mmsize)
%else
- OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+%ifdef HIGH_BIT_DEPTH
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
+%else
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+%endif
%exitrep
%endif
-%if x >= %3
+%if x >= %3*SIZEOF_PIXEL
%exitrep
%endif
%endrep
%endmacro
;-----------------------------------------------------------------------------
-;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h )
+;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 3
cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+ FIX_STRIDES r1, r3
mova m2, [r4]
+%ifdef HIGH_BIT_DEPTH
+%ifidn %3,add
+ mova m3, [pw_pixel_max]
+%endif
+%endif
LOAD_HEIGHT
.loop:
OFFSET_TWO_ROW r2, r0, %1, %3
@@ -467,6 +478,9 @@ INIT_XMM
OFFSETPN 12, sse2
OFFSETPN 16, sse2
OFFSETPN 20, sse2
+%ifdef HIGH_BIT_DEPTH
+OFFSETPN 8, sse2
+%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
%undef NUMREGS
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index e95daeb..3d6e91c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -50,8 +50,8 @@ DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int
void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
#define MC_WEIGHT_OFFSET(w,type) \
- void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
- void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
MC_WEIGHT(w,type)
MC_WEIGHT_OFFSET( 4, mmxext )
@@ -62,6 +62,9 @@ MC_WEIGHT_OFFSET( 20, mmxext )
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
+#if HIGH_BIT_DEPTH
+MC_WEIGHT_OFFSET( 8, sse2 )
+#endif
MC_WEIGHT( 8, sse2 )
MC_WEIGHT( 4, ssse3 )
MC_WEIGHT( 8, ssse3 )
@@ -220,7 +223,34 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
#if HIGH_BIT_DEPTH
MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
+MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,sse2,16)
+MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,sse2,16)
+
+static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
+{
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ w->weightfn = h->mc.offsetsub;
+ else
+ w->weightfn = h->mc.offsetadd;
+ for( int i = 0; i < 8; i++ )
+ w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8));
+ return;
+ }
+ w->weightfn = h->mc.weight;
+ int den1 = 1<<w->i_denom;
+ int den2 = w->i_scale<<1;
+ int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
+ for( int i = 0; i < 8; i++ )
+ {
+ w->cachea[i] = den1;
+ w->cacheb[i] = i&1 ? den3 : den2;
+ }
+}
#else
MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
@@ -268,7 +298,7 @@ static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
}
w->weightfn = h->mc.weight;
den1 = w->i_scale << (8 - w->i_denom);
- for(i = 0;i<8;i++)
+ for( i = 0; i < 8; i++ )
{
w->cachea[i] = den1;
w->cacheb[i] = w->i_offset;
@@ -458,6 +488,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mc_chroma = x264_mc_chroma_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->weight = x264_mc_weight_wtab_mmxext;
+ pf->weight_cache = x264_weight_cache_mmxext;
+ pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
+ pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
@@ -476,6 +509,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init8v = x264_integral_init8v_sse2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
+ pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
@@ -492,10 +527,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
#else // !HIGH_BIT_DEPTH
- pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
- pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
- pf->weight_cache = x264_weight_cache_mmxext;
-
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
More information about the x264-devel mailing list