[x264-devel] commit: MMX/SSE2/SSSE3 high bit depth frame_init_lowres functions ( Daniel Kang )
git at videolan.org
git at videolan.org
Mon Jan 10 22:01:01 CET 2011
x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Wed Dec 22 17:53:08 2010 -0500| [2b8f9731b7cb01f0c93df27e7fcc4f1e9fa75c0b] | committer: Jason Garrett-Glaser
MMX/SSE2/SSSE3 high bit depth frame_init_lowres functions
Patch from Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2b8f9731b7cb01f0c93df27e7fcc4f1e9fa75c0b
---
common/x86/mc-a2.asm | 88 +++++++++++++++++++++++++++++++++++++++++++++++---
common/x86/mc-c.c | 15 +++++++--
2 files changed, 95 insertions(+), 8 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 1b75dfe..8cb83f6 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1447,14 +1447,64 @@ cglobal integral_init4v_ssse3, 3,5
mova [%2], m2
%endmacro
+%macro FILT8xU 3 ; average 2x2 neighbourhoods of 16-bit words from two rows and split the results by word parity; %1, %2 = store addresses, %3 = byte offset added to r0
+ mova m3, [r0+%3+8] ; words at byte offset 8 of the row at r0+%3
+ mova m2, [r0+%3] ; words at byte offset 0 of the same row
+ pavgw m3, [r0+%3+r5+8] ; word-wise rounded average with the row r5 bytes further on
+ pavgw m2, [r0+%3+r5]
+ movu m1, [r0+%3+10] ; the same two rows again, read 2 bytes (one word) later
+ movu m0, [r0+%3+2]
+ pavgw m1, [r0+%3+r5+10]
+ pavgw m0, [r0+%3+r5+2]
+ pavgw m1, m3 ; average each vertical average with its one-word-later neighbour
+ pavgw m0, m2
+ mova m3, m1 ; keep copies so both words of every dword can be extracted
+ mova m2, m0
+ pand m1, m7 ; keep the low word of each dword; the HIGH_BIT_DEPTH caller sets m7 to 0x0000FFFF per dword (pcmpeqw + psrld 16)
+ pand m0, m7
+ psrld m3, 16 ; bring the high word of each dword down into the low word
+ psrld m2, 16
+ packssdw m0, m1 ; narrow dwords to words: low-word results of m0 followed by those of m1; NOTE(review): signed saturation, assumes values stay below 0x8000 - confirm
+ packssdw m2, m3 ; likewise for the high-word results of m2 followed by m3
+ movu [%1], m0 ; store the results taken from the low (even-indexed) words
+ mova [%2], m2 ; store the results taken from the high (odd-indexed) words
+%endmacro
+
+%macro FILT8xA 4 ; variant of the 2x2 word averaging that uses only mova loads plus PALIGNR; %1 = register carried between invocations, %2, %3 = store addresses, %4 = byte offset added to r0
+ mova m3, [r0+%4+mmsize] ; second vector of the row at r0+%4
+ mova m2, [r0+%4] ; first vector of the same row
+ pavgw m3, [r0+%4+r5+mmsize] ; word-wise rounded average with the row r5 bytes further on
+ pavgw m2, [r0+%4+r5]
+ PALIGNR %1, m3, 2, m6 ; PALIGNR is a macro defined outside this view; presumably %1 = (%1:m3) shifted right by 2 bytes with m6 as scratch - confirm
+ pavgw %1, m3 ; average the second vector with the shifted copy produced above
+ PALIGNR m3, m2, 2, m6 ; same combination for the first vector, with m3 as the upper half
+ pavgw m3, m2 ; average the first vector with its shifted copy
+ mova m5, m3 ; keep copies so both words of every dword can be extracted
+ mova m4, %1
+ pand m3, m7 ; keep the low word of each dword; the HIGH_BIT_DEPTH caller sets m7 to 0x0000FFFF per dword (pcmpeqw + psrld 16)
+ pand %1, m7
+ psrld m5, 16 ; bring the high word of each dword down into the low word
+ psrld m4, 16
+ packssdw m3, %1 ; narrow dwords to words: low-word results of m3 followed by those of %1; NOTE(review): signed saturation, assumes values stay below 0x8000 - confirm
+ packssdw m5, m4 ; likewise for the high-word results of m5 followed by m4
+ mova [%2], m3 ; store the results taken from the low (even-indexed) words
+ mova [%3], m5 ; store the results taken from the high (odd-indexed) words
+ mova %1, m2 ; leave this invocation's vertically averaged first vector in %1 for the next invocation's PALIGNR
+%endmacro
+
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal frame_init_lowres_core_%1, 6,7,%2
+%macro FRAME_INIT_LOWRES 1
+cglobal frame_init_lowres_core_%1, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%ifdef HIGH_BIT_DEPTH
+ shl dword r6m, 1
+ FIX_STRIDES r5d
+ shl dword r7m, 1
+%endif
%ifdef WIN64
- movsxd r5, r5d
+ movsxd r5, r5d
%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
@@ -1481,6 +1531,33 @@ cglobal frame_init_lowres_core_%1, 6,7,%2
shl r6d, 1
PUSH r6
%define src_gap [rsp]
+%ifdef HIGH_BIT_DEPTH
+ pcmpeqw m7, m7
+ psrld m7, 16
+.vloop:
+ mov r6d, r7m
+%ifnidn %1,mmxext
+ mova m0, [r0]
+ mova m1, [r0+r5]
+ pavgw m0, m1
+ pavgw m1, [r0+r5*2]
+%endif
+.hloop:
+ sub r0, mmsize*2
+ sub r1, mmsize
+ sub r2, mmsize
+ sub r3, mmsize
+ sub r4, mmsize
+%ifidn %1,mmxext
+ FILT8xU r1, r2, 0
+ FILT8xU r3, r4, r5
+%else
+ FILT8xA m0, r1, r2, 0
+ FILT8xA m1, r3, r4, r5
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%else ; !HIGH_BIT_DEPTH
%if mmsize == 16
; adjust for the odd end case
mov r6d, r7m
@@ -1544,6 +1621,7 @@ cglobal frame_init_lowres_core_%1, 6,7,%2
%endif
sub r6d, mmsize
jg .hloop
+%endif ; HIGH_BIT_DEPTH
.skip:
mov r6, dst_gap
sub r0, src_gap
@@ -1565,9 +1643,9 @@ FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
%endif
INIT_XMM
-FRAME_INIT_LOWRES sse2, 12
+FRAME_INIT_LOWRES sse2
%define PALIGNR PALIGNR_SSSE3
-FRAME_INIT_LOWRES ssse3, 12
+FRAME_INIT_LOWRES ssse3
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 1b135cc..5477646 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -137,7 +137,7 @@ MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
#define LOWRES(cpu)\
-void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
int src_stride, int dst_stride, int width, int height );
LOWRES(mmxext)
LOWRES(cache32_mmxext)
@@ -510,10 +510,19 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
+
#if HIGH_BIT_DEPTH
+#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
+ if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
+#endif
+
if( !(cpu&X264_CPU_SSE2) )
return;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
+
pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
@@ -557,11 +566,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_SSSE3) )
return;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
#else // !HIGH_BIT_DEPTH
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
-
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
More information about the x264-devel
mailing list