[x264-devel] commit: MMX/SSE2/SSSE3 high bit depth frame_init_lowres functions ( Daniel Kang )

git at videolan.org git at videolan.org
Mon Jan 10 22:01:01 CET 2011


x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Wed Dec 22 17:53:08 2010 -0500| [2b8f9731b7cb01f0c93df27e7fcc4f1e9fa75c0b] | committer: Jason Garrett-Glaser 

MMX/SSE2/SSSE3 high bit depth frame_init_lowres functions

Patch from Google Code-In.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2b8f9731b7cb01f0c93df27e7fcc4f1e9fa75c0b
---

 common/x86/mc-a2.asm |   88 +++++++++++++++++++++++++++++++++++++++++++++++---
 common/x86/mc-c.c    |   15 +++++++--
 2 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 1b75dfe..8cb83f6 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1447,14 +1447,64 @@ cglobal integral_init4v_ssse3, 3,5
     mova    [%2], m2
 %endmacro
 
+%macro FILT8xU 3
+    mova      m3, [r0+%3+8]
+    mova      m2, [r0+%3]
+    pavgw     m3, [r0+%3+r5+8]
+    pavgw     m2, [r0+%3+r5]
+    movu      m1, [r0+%3+10]
+    movu      m0, [r0+%3+2]
+    pavgw     m1, [r0+%3+r5+10]
+    pavgw     m0, [r0+%3+r5+2]
+    pavgw     m1, m3
+    pavgw     m0, m2
+    mova      m3, m1
+    mova      m2, m0
+    pand      m1, m7
+    pand      m0, m7
+    psrld     m3, 16
+    psrld     m2, 16
+    packssdw  m0, m1
+    packssdw  m2, m3
+    movu    [%1], m0
+    mova    [%2], m2
+%endmacro
+
+%macro FILT8xA 4
+    mova      m3, [r0+%4+mmsize]
+    mova      m2, [r0+%4]
+    pavgw     m3, [r0+%4+r5+mmsize]
+    pavgw     m2, [r0+%4+r5]
+    PALIGNR   %1, m3, 2, m6
+    pavgw     %1, m3
+    PALIGNR   m3, m2, 2, m6
+    pavgw     m3, m2
+    mova      m5, m3
+    mova      m4, %1
+    pand      m3, m7
+    pand      %1, m7
+    psrld     m5, 16
+    psrld     m4, 16
+    packssdw  m3, %1
+    packssdw  m5, m4
+    mova    [%2], m3
+    mova    [%3], m5
+    mova      %1, m2
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
 ;                              int src_stride, int dst_stride, int width, int height )
 ;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal frame_init_lowres_core_%1, 6,7,%2
+%macro FRAME_INIT_LOWRES 1
+cglobal frame_init_lowres_core_%1, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%ifdef HIGH_BIT_DEPTH
+    shl   dword r6m, 1
+    FIX_STRIDES r5d
+    shl   dword r7m, 1
+%endif
 %ifdef WIN64
-    movsxd   r5, r5d
+    movsxd    r5, r5d
 %endif
     ; src += 2*(height-1)*stride + 2*width
     mov      r6d, r8m
@@ -1481,6 +1531,33 @@ cglobal frame_init_lowres_core_%1, 6,7,%2
     shl      r6d, 1
     PUSH      r6
     %define src_gap [rsp]
+%ifdef HIGH_BIT_DEPTH
+    pcmpeqw   m7, m7
+    psrld     m7, 16
+.vloop:
+    mov      r6d, r7m
+%ifnidn %1,mmxext
+    mova      m0, [r0]
+    mova      m1, [r0+r5]
+    pavgw     m0, m1
+    pavgw     m1, [r0+r5*2]
+%endif
+.hloop:
+    sub       r0, mmsize*2
+    sub       r1, mmsize
+    sub       r2, mmsize
+    sub       r3, mmsize
+    sub       r4, mmsize
+%ifidn %1,mmxext
+    FILT8xU r1, r2, 0
+    FILT8xU r3, r4, r5
+%else
+    FILT8xA m0, r1, r2, 0
+    FILT8xA m1, r3, r4, r5
+%endif
+    sub      r6d, mmsize
+    jg .hloop
+%else ; !HIGH_BIT_DEPTH
 %if mmsize == 16
     ; adjust for the odd end case
     mov      r6d, r7m
@@ -1544,6 +1621,7 @@ cglobal frame_init_lowres_core_%1, 6,7,%2
 %endif
     sub      r6d, mmsize
     jg .hloop
+%endif ; HIGH_BIT_DEPTH
 .skip:
     mov       r6, dst_gap
     sub       r0, src_gap
@@ -1565,9 +1643,9 @@ FRAME_INIT_LOWRES mmxext
 FRAME_INIT_LOWRES cache32_mmxext
 %endif
 INIT_XMM
-FRAME_INIT_LOWRES sse2, 12
+FRAME_INIT_LOWRES sse2
 %define PALIGNR PALIGNR_SSSE3
-FRAME_INIT_LOWRES ssse3, 12
+FRAME_INIT_LOWRES ssse3
 
 ;-----------------------------------------------------------------------------
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 1b135cc..5477646 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -137,7 +137,7 @@ MC_CHROMA(ssse3)
 MC_CHROMA(ssse3_cache64)
 
 #define LOWRES(cpu)\
-void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
                                         int src_stride, int dst_stride, int width, int height );
 LOWRES(mmxext)
 LOWRES(cache32_mmxext)
@@ -510,10 +510,19 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
     pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
 
+    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
+
 #if HIGH_BIT_DEPTH
+#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
+    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
+        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
+#endif
+
     if( !(cpu&X264_CPU_SSE2) )
         return;
 
+    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
+
     pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
     pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
 
@@ -557,11 +566,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_SSSE3) )
         return;
 
+    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+
     if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
 #else // !HIGH_BIT_DEPTH
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
-
     pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
     pf->prefetch_ref  = x264_prefetch_ref_mmxext;
 



More information about the x264-devel mailing list