[x264-devel] commit: SSSE3 cachesplit workaround for avg2_w16 (Jason Garrett-Glaser )

Sun Jul 26 10:46:44 CEST 2009

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Jul 21 19:56:21 2009 -0700| [f5e6980b3eb34ed610f5fc36a4378a0ed4277753] | committer: Jason Garrett-Glaser 

SSSE3 cachesplit workaround for avg2_w16
Palignr-based solution for the most commonly used qpel function.
1-1.5% faster overall on Core 2 chips.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f5e6980b3eb34ed610f5fc36a4378a0ed4277753
---

 common/x86/mc-a.asm |   60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/mc-c.c   |    8 ++++++
 2 files changed, 68 insertions(+), 0 deletions(-)

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 206bd35..3e9df66 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -511,6 +511,66 @@ AVG_CACHELINE_CHECK 12, 64, mmxext
 AVG_CACHELINE_CHECK 16, 64, sse2
 AVG_CACHELINE_CHECK 20, 64, sse2
 
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %2&15==0
+    movdqa  xmm1, [r2+16]
+    palignr xmm1, [r2], %1
+    pavgb   xmm1, [r2+r4]
+%else
+    movdqa  xmm1, [r2+16]
+    movdqa  xmm2, [r2+r4+16]
+    palignr xmm1, [r2], %1
+    palignr xmm2, [r2+r4], %2
+    pavgb   xmm1, xmm2
+%endif
+    movdqa  [r0], xmm1
+    add    r2, r3
+    add    r0, r1
+    dec    r5d
+    jg     avg_w16_align%1_%2_ssse3
+    rep ret
+%endmacro
+
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
+
+cglobal x264_pixel_avg2_w16_cache64_ssse3
+    mov    eax, r2m
+    and    eax, 0x3f
+    cmp    eax, 0x30
+    jle x264_pixel_avg2_w16_sse2
+    PROLOGUE 6,7
+    lea    r6, [r4+r2]
+    and    r4, ~0xf
+    and    r6, 0x1f
+    and    r2, ~0xf
+    lea    r6, [r6*3]    ;(offset + align*2)*3
+    sub    r4, r2
+    shl    r6, 4         ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+    lea   r11, [avg_w16_addr GLOBAL]
+    add    r6, r11
+%else
+    lea    r6, [avg_w16_addr + r6 GLOBAL]
+%endif
+%ifdef UNIX64
+    jmp    r6
+%else
+    call   r6
+    RET
+%endif
+
+
 ;=============================================================================
 ; pixel copy
 ;=============================================================================
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index dcf623a..f69b99c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -94,6 +94,7 @@ PIXEL_AVG_WALL(cache64_mmxext)
 PIXEL_AVG_WALL(cache64_sse2)
 PIXEL_AVG_WALL(sse2)
 PIXEL_AVG_WALL(sse2_misalign)
+PIXEL_AVG_WALL(cache64_ssse3)
 
 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
 static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -119,6 +120,7 @@ PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_m
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
 PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -166,6 +168,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
 
 #define GET_REF(name)\
 static uint8_t *get_ref_##name( uint8_t *dst,   int *i_dst_stride,\
@@ -199,6 +202,7 @@ GET_REF(cache64_mmxext)
 GET_REF(sse2)
 GET_REF(sse2_misalign)
 GET_REF(cache64_sse2)
+GET_REF(cache64_ssse3)
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -344,7 +348,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->mc_chroma = x264_mc_chroma_ssse3;
     if( cpu&X264_CPU_CACHELINE_64 )
+    {
         pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+        pf->mc_luma = mc_luma_cache64_ssse3;
+        pf->get_ref = get_ref_cache64_ssse3;
+    }
 
     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
         pf->integral_init4v = x264_integral_init4v_ssse3;