[x264-devel] commit: SSSE3 cachesplit workaround for avg2_w16 (Jason Garrett-Glaser)
git version control
git at videolan.org
Sun Jul 26 10:46:44 CEST 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Jul 21 19:56:21 2009 -0700| [f5e6980b3eb34ed610f5fc36a4378a0ed4277753] | committer: Jason Garrett-Glaser
SSSE3 cachesplit workaround for avg2_w16
Palignr-based solution for the most commonly used qpel function.
1-1.5% faster overall on Core 2 chips.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f5e6980b3eb34ed610f5fc36a4378a0ed4277753
---
common/x86/mc-a.asm | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/mc-c.c | 8 ++++++
2 files changed, 68 insertions(+), 0 deletions(-)
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 206bd35..3e9df66 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -511,6 +511,66 @@ AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %2&15==0
+ movdqa xmm1, [r2+16]
+ palignr xmm1, [r2], %1
+ pavgb xmm1, [r2+r4]
+%else
+ movdqa xmm1, [r2+16]
+ movdqa xmm2, [r2+r4+16]
+ palignr xmm1, [r2], %1
+ palignr xmm2, [r2+r4], %2
+ pavgb xmm1, xmm2
+%endif
+ movdqa [r0], xmm1
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg avg_w16_align%1_%2_ssse3
+ rep ret
+%endmacro
+
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
+
+cglobal x264_pixel_avg2_w16_cache64_ssse3
+ mov eax, r2m
+ and eax, 0x3f
+ cmp eax, 0x30
+ jle x264_pixel_avg2_w16_sse2
+ PROLOGUE 6,7
+ lea r6, [r4+r2]
+ and r4, ~0xf
+ and r6, 0x1f
+ and r2, ~0xf
+ lea r6, [r6*3] ;(offset + align*2)*3
+ sub r4, r2
+ shl r6, 4 ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+ lea r11, [avg_w16_addr GLOBAL]
+ add r6, r11
+%else
+ lea r6, [avg_w16_addr + r6 GLOBAL]
+%endif
+%ifdef UNIX64
+ jmp r6
+%else
+ call r6
+ RET
+%endif
+
+
;=============================================================================
; pixel copy
;=============================================================================
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index dcf623a..f69b99c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -94,6 +94,7 @@ PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
+PIXEL_AVG_WALL(cache64_ssse3)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -119,6 +120,7 @@ PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_m
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -166,6 +168,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
#define GET_REF(name)\
static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
@@ -199,6 +202,7 @@ GET_REF(cache64_mmxext)
GET_REF(sse2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
+GET_REF(cache64_ssse3)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -344,7 +348,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
+ {
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+ pf->mc_luma = mc_luma_cache64_ssse3;
+ pf->get_ref = get_ref_cache64_ssse3;
+ }
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->integral_init4v = x264_integral_init4v_ssse3;
More information about the x264-devel mailing list