[x264-devel] commit: SSSE3, faster SSE2/MMX integral_init4v (Jason Garrett-Glaser)
git version control
git at videolan.org
Sat Jun 20 01:28:59 CEST 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Jun 10 10:11:00 2009 -0700| [b555e3f90b2060542d44bcb1a254d5a7bfc5d23a] | committer: Jason Garrett-Glaser
SSSE3, faster SSE2/MMX integral_init4v
The real reason I wrote this was an excuse to use shufpd.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b555e3f90b2060542d44bcb1a254d5a7bfc5d23a
---
common/x86/mc-a2.asm | 105 ++++++++++++++++++++++++++++++++++++-------------
common/x86/mc-c.c | 4 ++
2 files changed, 81 insertions(+), 28 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 1148910..9745ac6 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -796,32 +796,7 @@ cglobal x264_integral_init8h_sse4, 3,4
jl .loop
REP_RET
-%macro INTEGRAL_INIT 1
-;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
-;-----------------------------------------------------------------------------
-cglobal x264_integral_init4v_%1, 3,5
- shl r2, 1
- add r0, r2
- add r1, r2
- lea r3, [r0+r2*4]
- lea r4, [r0+r2*8]
- neg r2
-.loop:
- movu m0, [r0+r2+8]
- mova m2, [r0+r2]
- movu m1, [r4+r2+8]
- paddw m0, m2
- paddw m1, [r4+r2]
- mova m3, [r3+r2]
- psubw m1, m0
- psubw m3, m2
- mova [r0+r2], m1
- mova [r1+r2], m3
- add r2, mmsize
- jl .loop
- REP_RET
-
+%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
@@ -843,11 +818,85 @@ cglobal x264_integral_init8v_%1, 3,3
%endmacro
INIT_MMX
-INTEGRAL_INIT mmx
+INTEGRAL_INIT_8V mmx
INIT_XMM
-INTEGRAL_INIT sse2
+INTEGRAL_INIT_8V sse2
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_integral_init4v_mmx, 3,5
+ shl r2, 1
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ mova m0, [r0+r2]
+ mova m4, [r4+r2]
+.loop:
+ sub r2, 8
+ mova m1, m4
+ psubw m1, m0
+ mova m4, [r4+r2]
+ mova m0, [r0+r2]
+ paddw m1, m4
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m0
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ jge .loop
+ REP_RET
+INIT_XMM
+cglobal x264_integral_init4v_sse2, 3,5
+ shl r2, 1
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
+.loop:
+ mova m0, [r0+r2]
+ mova m1, [r4+r2]
+ mova m2, m0
+ mova m4, m1
+ shufpd m0, [r0+r2+16], 1
+ shufpd m1, [r4+r2+16], 1
+ paddw m0, m2
+ paddw m1, m4
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, 16
+ jl .loop
+ REP_RET
+cglobal x264_integral_init4v_ssse3, 3,5
+ shl r2, 1
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
+.loop:
+ mova m2, [r0+r2]
+ mova m0, [r0+r2+16]
+ mova m4, [r4+r2]
+ mova m1, [r4+r2+16]
+ palignr m0, m2, 8
+ palignr m1, m4, 8
+ paddw m0, m2
+ paddw m1, m4
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, 16
+ jl .loop
+ REP_RET
%macro FILT8x4 7
mova %3, [r0+%7]
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index c66612d..aede5b8 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -70,6 +70,7 @@ extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride
extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
@@ -340,6 +341,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->integral_init4v = x264_integral_init4v_ssse3;
+
if( !(cpu&X264_CPU_SSE4) )
return;
More information about the x264-devel mailing list