[x264-devel] commit: MMX/SSE2 high bit depth weight_cache/offset(sub|add) functions ( Daniel Kang )

Wed Dec 15 04:19:35 CET 2010

x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Mon Dec 13 17:15:12 2010 -0500| [b5e236636a26cf40568ccc44fefcc6aef6d0f0ea] | committer: Jason Garrett-Glaser 

MMX/SSE2 high bit depth weight_cache/offset(sub|add) functions

Patch from Google Code-In.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b5e236636a26cf40568ccc44fefcc6aef6d0f0ea
---

 common/x86/mc-a.asm |   44 +++++++++++++++++++++++++++++---------------
 common/x86/mc-c.c   |   45 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 22fb872..3ef4385 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -188,18 +188,10 @@ AVG_WEIGHT ssse3, 16, 7
 
 %ifdef HIGH_BIT_DEPTH
 %macro WEIGHT_START 1 ; (width)
+    mova        m0, [r4+ 0]         ; 1<<denom
+    mova        m3, [r4+16]         ; presumably precomputed word pairs: scale<<1, 1+(offset<<(BIT_DEPTH-7)) -- confirm against the weight_cache function
     movd        m2, [r4+32]         ; denom
-    movd        m3, [r4+36]         ; scale
-    mov    TMP_REG, [r4+40]         ; offset
-    mova        m0, [pw_1]
-    shl    TMP_REG, BIT_DEPTH-7
     mova        m4, [pw_pixel_max]
-    add    TMP_REG, 1
-    psllw       m0, m2              ; 1<<denom
-    movd        m1, TMP_REG         ; 1+(offset<<(BIT_DEPTH-8+1))
-    psllw       m3, 1               ; scale<<1
-    punpcklwd   m3, m1
-    SPLATD      m3, m3
     paddw       m2, [sq_1]          ; denom+1
 %endmacro
 
@@ -354,7 +346,7 @@ AVG_WEIGHT ssse3, 16, 7
 %endif ; HIGH_BIT_DEPTH
 
 ;-----------------------------------------------------------------------------
-;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
+;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
 ;-----------------------------------------------------------------------------
 
 %ifdef ARCH_X86_64
@@ -415,8 +407,17 @@ WEIGHTER 20, ssse3
 %macro OFFSET_OP 7
     mov%6        m0, [%1]
     mov%6        m1, [%2]
+%ifdef HIGH_BIT_DEPTH
+    p%5usw       m0, m2             ; word-sized unsigned saturating add/sub of m2 (byte-sized variant is in the %else branch)
+    p%5usw       m1, m2
+%ifidn %5,add
+    pminsw       m0, m3             ; add only: take the signed word minimum with m3 (OFFSET loads pw_pixel_max into m3 for add)
+    pminsw       m1, m3
+%endif
+%else
     p%5usb       m0, m2
     p%5usb       m1, m2
+%endif
     mov%7      [%3], m0
     mov%7      [%4], m1
 %endmacro
@@ -424,25 +425,35 @@ WEIGHTER 20, ssse3
 %macro OFFSET_TWO_ROW 4
 %assign x 0
 %rep %3
-%if (%3-x) >= mmsize
+%if (%3*SIZEOF_PIXEL-x) >= mmsize ; x is a byte offset (it advances by mmsize), so the width %3 is scaled by SIZEOF_PIXEL
     OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
     %assign x (x+mmsize)
 %else
-    OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+%ifdef HIGH_BIT_DEPTH ; remaining tail is moved with movh (h, h) here instead of movd (d, d)
+    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
+%else
+    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+%endif
     %exitrep
 %endif
-%if x >= %3
+%if x >= %3*SIZEOF_PIXEL
     %exitrep
 %endif
 %endrep
 %endmacro
 
 ;-----------------------------------------------------------------------------
-;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h )
+;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
 ;-----------------------------------------------------------------------------
 %macro OFFSET 3
     cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+    FIX_STRIDES r1, r3
     mova m2, [r4]
+%ifdef HIGH_BIT_DEPTH
+%ifidn %3,add
+    mova m3, [pw_pixel_max]
+%endif
+%endif
     LOAD_HEIGHT
 .loop:
     OFFSET_TWO_ROW r2, r0, %1, %3
@@ -467,6 +478,9 @@ INIT_XMM
 OFFSETPN 12, sse2
 OFFSETPN 16, sse2
 OFFSETPN 20, sse2
+%ifdef HIGH_BIT_DEPTH
+OFFSETPN  8, sse2
+%endif
 %undef LOAD_HEIGHT
 %undef HEIGHT_REG
 %undef NUMREGS
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index e95daeb..3d6e91c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -50,8 +50,8 @@ DECL_SUF( x264_pixel_avg_4x2,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int
     void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
 
 #define MC_WEIGHT_OFFSET(w,type) \
-    void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
-    void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
+    void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
     MC_WEIGHT(w,type)
 
 MC_WEIGHT_OFFSET( 4, mmxext )
@@ -62,6 +62,9 @@ MC_WEIGHT_OFFSET( 20, mmxext )
 MC_WEIGHT_OFFSET( 12, sse2 )
 MC_WEIGHT_OFFSET( 16, sse2 )
 MC_WEIGHT_OFFSET( 20, sse2 )
+#if HIGH_BIT_DEPTH
+MC_WEIGHT_OFFSET( 8, sse2 )
+#endif
 MC_WEIGHT( 8, sse2  )
 MC_WEIGHT( 4, ssse3 )
 MC_WEIGHT( 8, ssse3 )
@@ -220,7 +223,34 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
 
 #if HIGH_BIT_DEPTH
 MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
 MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
+MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,sse2,16)
+MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,sse2,16)
+
+static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w ) // fills w->cachea/w->cacheb and selects w->weightfn
+{
+    if( w->i_scale == 1<<w->i_denom ) // scale equals 1<<denom: use the offset-only functions
+    {
+        if( w->i_offset < 0 )
+            w->weightfn = h->mc.offsetsub; // negative offset -> offsetsub; only the magnitude is cached below
+        else
+            w->weightfn = h->mc.offsetadd;
+        for( int i = 0; i < 8; i++ )
+            w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8)); // NOTE(review): i_offset may be negative here and is left-shifted before abs()
+        return;
+    }
+    w->weightfn = h->mc.weight;
+    int den1 = 1<<w->i_denom;
+    int den2 = w->i_scale<<1;
+    int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1)); // NOTE(review): same left shift of a possibly negative i_offset
+    for( int i = 0; i < 8; i++ )
+    {
+        w->cachea[i] = den1;
+        w->cacheb[i] = i&1 ? den3 : den2; // even entries hold den2 (scale<<1), odd entries hold den3
+    }
+}
 #else
 MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
 MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
@@ -268,7 +298,7 @@ static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
     }
     w->weightfn = h->mc.weight;
     den1 = w->i_scale << (8 - w->i_denom);
-    for(i = 0;i<8;i++)
+    for( i = 0; i < 8; i++ )
     {
         w->cachea[i] = den1;
         w->cacheb[i] = w->i_offset;
@@ -458,6 +488,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->mc_chroma = x264_mc_chroma_mmxext;
     pf->hpel_filter = x264_hpel_filter_mmxext;
     pf->weight = x264_mc_weight_wtab_mmxext;
+    pf->weight_cache = x264_weight_cache_mmxext;
+    pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
+    pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
 
 #if HIGH_BIT_DEPTH
     if( !(cpu&X264_CPU_SSE2) )
@@ -476,6 +509,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
     pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
+    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
 
     if( cpu&X264_CPU_SSE2_IS_SLOW )
         return;
@@ -492,10 +527,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
 #else // !HIGH_BIT_DEPTH
-    pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
-    pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
-    pf->weight_cache = x264_weight_cache_mmxext;
-
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
     pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmxext;