[x265] [PATCH] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

Min Chen chenm003 at 163.com
Mon Oct 28 15:40:39 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1382971217 -28800
# Node ID 536ff3d483523b9c657e97801ebf8e5ac8ab21f5
# Parent  edc6c8e9a865c9c6a9dad40feff27014f68bc19d
asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

diff -r edc6c8e9a865 -r 536ff3d48352 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Oct 28 22:39:39 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp	Mon Oct 28 22:40:17 2013 +0800
@@ -280,6 +280,7 @@
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
 
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
+        p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r edc6c8e9a865 -r 536ff3d48352 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Oct 28 22:39:39 2013 +0800
+++ b/source/common/x86/ipfilter8.asm	Mon Oct 28 22:40:17 2013 +0800
@@ -774,3 +774,130 @@
     jnz         .loopV
 
     RET
+
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+;
+; 8-tap vertical filter: reads int16_t samples, writes packed 8-bit pixels.
+; Each output row is built from the 8 source rows (row - 3) .. (row + 4).
+; Columns are handled 8 at a time; a remainder below 8 stores 4 pixels.
+;
+; r0 = src, r1 = srcStride (converted to bytes below), r2 = dst,
+; r3 = dstStride, r4 = scratch / columns left, r5d = rows left,
+; r6 = coefficient rows for coeffIdx
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_8tap_v_sp, 4, 7, 8, 0-(2*4 + 3*gprsize)
+; Register-wide temporaries get gprsize-wide slots: a full-register store into
+; a 4-byte slot would spill into the slot that follows it.
+%define old_r0      (rsp + 2 * 4 + 0 * gprsize)
+%define old_r2      (rsp + 2 * 4 + 1 * gprsize)
+%define old_6rows   (rsp + 2 * 4 + 2 * gprsize)
+%define old_r4d     (rsp + 0 * 4)
+
+    mov         r4d,        r4m
+    mov         r5d,        r5m
+
+    ; save width and dst; r4 is free as scratch until .loopH reloads it
+    mov         [old_r4d], r4d
+    mov         [old_r2], r2
+
+    ; load coeff table: 64 bytes per coeffIdx; the table base is loaded on its
+    ; own because a RIP-relative address cannot also carry an index register
+    mov         r6d,        r6m
+    shl         r6,         6
+    lea         r4,         [tab_LumaCoeffV]
+    add         r6,         r4
+
+    ; move to -3
+    lea         r1, [r1 * 2]            ; stride in bytes (2 bytes per sample)
+    lea         r4, [r1 + r1 * 2]       ; 3 rows
+    sub         r0, r4
+    lea         r4, [r4 * 2]            ; 6 rows, undone after each column group
+    mov         [old_6rows], r4
+
+.loopH:
+
+    ; load width
+    mov         r4d, [old_r4d]
+
+    ; save old src
+    mov         [old_r0], r0
+
+.loopW:
+
+    ; rows 0-1: interleave word-wise; pmaddwd multiplies and sums each pair
+    movu        m0, [r0]
+    movu        m1, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m2, m0, m1
+    pmaddwd     m2, [r6 + 0 * 16]
+    punpckhwd   m0, m1
+    pmaddwd     m0, [r6 + 0 * 16]
+
+    ; rows 2-3, accumulated into m2 (low 4 columns) and m0 (high 4 columns)
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 1 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 1 * 16]
+    paddd       m0, m3
+
+    ; rows 4-5
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 2 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 2 * 16]
+    paddd       m0, m3
+
+    ; rows 6-7 (r0 is left 6 rows past the first row of this group)
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 3 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 3 * 16]
+    paddd       m0, m3
+
+    ; add tab_c_526336 (defined elsewhere), shift right by 12, saturate to bytes
+    paddd       m2, [tab_c_526336]
+    paddd       m0, [tab_c_526336]
+    psrad       m2, 12
+    psrad       m0, 12
+    packssdw    m2, m0
+    packuswb    m2, m2
+
+    ; move to next 8 col
+    sub         r0, [old_6rows]
+
+    sub         r4, 8
+    jl          .width4
+    movq        [r2], m2                ; SSE store leaves the sub's flags intact
+    je          .nextH
+    lea         r0, [r0 + 16]
+    lea         r2, [r2 + 8]
+    jmp         .loopW
+
+.width4:
+    movd        [r2], m2
+
+.nextH:
+    ; move to next row
+    mov         r0, [old_r0]
+    lea         r0, [r0 + r1]
+    add         [old_r2], r3            ; full-width add: slot holds a pointer
+    mov         r2, [old_r2]
+
+    dec         r5d
+    jnz         .loopH
+
+    RET
diff -r edc6c8e9a865 -r 536ff3d48352 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Mon Oct 28 22:39:39 2013 +0800
+++ b/source/common/x86/ipfilter8.h	Mon Oct 28 22:40:17 2013 +0800
@@ -89,6 +89,7 @@
 LUMA_FILTERS(_sse4);
 
 void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
+void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
 
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_LUMA_FUNC_DEF
diff -r edc6c8e9a865 -r 536ff3d48352 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Mon Oct 28 22:39:39 2013 +0800
+++ b/source/test/ipfilterharness.cpp	Mon Oct 28 22:40:17 2013 +0800
@@ -164,6 +164,8 @@
     int rand_width = rand() % 100;                  // Randomly generated Width
     int16_t rand_val, rand_srcStride, rand_dstStride;
 
+    rand_width &= ~3;
+
     for (int i = 0; i <= 100; i++)
     {
         memset(IPF_vec_output_p, 0, ipf_t_size);      // Initialize output buffer to zero
@@ -173,16 +175,16 @@
         rand_srcStride = rand() % 100;              // Randomly generated srcStride
         rand_dstStride = rand() % 100;              // Randomly generated dstStride
 
-        opt(short_buff + 3 * rand_srcStride,
+        ref(short_buff + 3 * rand_srcStride,
             rand_srcStride,
-            IPF_vec_output_p,
+            IPF_C_output_p,
             rand_dstStride,
             rand_width,
             rand_height, rand_val
             );
-        ref(short_buff + 3 * rand_srcStride,
+        opt(short_buff + 3 * rand_srcStride,
             rand_srcStride,
-            IPF_C_output_p,
+            IPF_vec_output_p,
             rand_dstStride,
             rand_width,
             rand_height, rand_val



More information about the x265-devel mailing list