[x265] [PATCH] asm: filterConvertPelToShort

Min Chen chenm003 at 163.com
Wed Oct 30 15:47:57 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1383144464 -28800
# Node ID 6bfafdf72eaef415aba43f4579f222cccbac60d9
# Parent  77db80a67f4e55f22bc02ed02930a269bfac6b50
asm: filterConvertPelToShort

diff -r 77db80a67f4e -r 6bfafdf72eae source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Wed Oct 30 22:47:44 2013 +0800
@@ -516,6 +516,9 @@
     int xFrac = mv->x & 0x3;
     int yFrac = mv->y & 0x3;
 
+    assert((width % 4) + (height % 4) == 0);
+    assert(dstStride == MAX_CU_SIZE);
+
     if ((yFrac | xFrac) == 0)
     {
         primitives.ipfilter_p2s(ref, refStride, dst, dstStride, width, height);
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/ipfilter.cpp	Wed Oct 30 22:47:44 2013 +0800
@@ -264,6 +264,24 @@
     }
 }
 
+// Converts a block of pels to the 16-bit intermediate sample format:
+// each output is (pel << shift) - IF_INTERNAL_OFFS.
+// Output rows are MAX_CU_SIZE int16_t apart; there is no dstStride parameter.
+void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+{
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+
+    for (int y = 0; y < height; y++, src += srcStride, dst += MAX_CU_SIZE)
+    {
+        for (int x = 0; x < width; x++)
+        {
+            // Scale up first, then remove the offset.
+            const int16_t scaled = src[x] << shift;
+            dst[x] = scaled - (int16_t)IF_INTERNAL_OFFS;
+        }
+    }
+}
+
 template<int N>
 void filterVertical_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *c)
 {
@@ -471,6 +489,7 @@
 
     p.ipfilter_p2s = filterConvertPelToShort_c;
     p.ipfilter_s2p = filterConvertShortToPel_c;
+    p.luma_p2s = filterConvertPelToShort_c;
 
     p.extendRowBorder = extendCURowColBorder;
 }
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/primitives.h
--- a/source/common/primitives.h	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/primitives.h	Wed Oct 30 22:47:44 2013 +0800
@@ -210,6 +210,7 @@
 
 typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -247,6 +248,7 @@
     filter_pp_t     chroma_vpp[NUM_CHROMA_PARTITIONS];
     filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
     filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
+    filter_p2s_t    luma_p2s;
 
     intra_dc_t      intra_pred_dc;
     intra_planar_t  intra_pred_planar;
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Oct 30 22:47:44 2013 +0800
@@ -296,6 +296,7 @@
 
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
         p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
+        p.luma_p2s = x265_luma_p2s_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Oct 30 22:47:44 2013 +0800
@@ -78,6 +78,10 @@
                 times 4 dw 58, -10
                 times 4 dw 4, -1
 
+tab_c_128:      times 16 db 0x80
+tab_c_64_n64:   times 8 db 64, -64
+
+
 SECTION .text
 
 %macro FILTER_H4_w2_2 3
@@ -906,6 +910,7 @@
     jnz         .loopH
 
     RET
+
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -2056,3 +2061,69 @@
 FILTER_V4_W32 32, 16
 FILTER_V4_W32 32, 24
 FILTER_V4_W32 32, 32
+
+
+;-----------------------------------------------------------------------------
+; void luma_p2s(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal luma_p2s, 3, 7, 8
+
+    ; load width and height
+    mov         r3d, r3m
+    mov         r4d, r4m
+
+    ; load constant
+    mova        m6, [tab_c_128]
+    mova        m7, [tab_c_64_n64]
+
+    ; bias dst by -16 bytes: r5 has already advanced by 8 pels when a store is issued
+    lea         r2, [r2 - 16]
+.loopH:
+    ; each .loopH pass handles 4 rows; r5 is the column offset in pels
+    xor         r5d, r5d
+.loopW:
+    lea         r6, [r0 + r5]
+    ; (pel, 128) byte pairs times (64, -64) -> pel * 64 - 8192 in each word
+    movh        m0, [r6]
+    punpcklbw   m0, m6
+    pmaddubsw   m0, m7
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m6
+    pmaddubsw   m1, m7
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m6
+    pmaddubsw   m2, m7
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m6
+    pmaddubsw   m3, m7
+
+    add         r5, 8
+    cmp         r5, r3
+    jg          .width4                 ; fewer than 8 pels left: store only 4
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+    ; r5 must advance only once per 8 pels; the stores leave the cmp flags intact
+    je          .nextH
+    jmp         .loopW
+
+.width4:
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 2 * 4
+
+    sub         r4, 4
+    jnz         .loopH
+
+    RET
diff -r 77db80a67f4e -r 6bfafdf72eae source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/common/x86/ipfilter8.h	Wed Oct 30 22:47:44 2013 +0800
@@ -90,6 +90,7 @@
 
 void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
 void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_LUMA_FUNC_DEF
diff -r 77db80a67f4e -r 6bfafdf72eae source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/test/ipfilterharness.cpp	Wed Oct 30 22:47:44 2013 +0800
@@ -240,6 +240,48 @@
     return true;
 }
 
+// Checks that the optimized luma_p2s matches the C reference over random 4-aligned block sizes.
+bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+    // NOTE(review): assumed to equal the fixed dst row pitch the primitive writes with
+    // (MAX_CU_SIZE in the C reference); wider blocks would spill into the next row - confirm.
+    const int16_t maxSize = 64;
+
+    for (int i = 0; i <= 1000; i++)
+    {
+        int16_t rand_height = (int16_t)(rand() % 100);               // Randomly generated Height
+        int16_t rand_width = (int16_t)(rand() % 100);                // Randomly generated Width
+
+        memset(IPF_vec_output_s, 0, ipf_t_size);      // Initialize output buffer to zero
+        memset(IPF_C_output_s, 0, ipf_t_size);        // Initialize output buffer to zero
+
+        // Round down to a multiple of 4, then clamp into [4, maxSize]. The previous
+        // "%= 4" left only 0..3, so every iteration ended up testing a 4x4 block.
+        rand_width &= ~3;
+        if (rand_width < 4)
+            rand_width = 4;
+        if (rand_width > maxSize)
+            rand_width = maxSize;
+
+        rand_height &= ~3;
+        if (rand_height < 4)
+            rand_height = 4;
+        if (rand_height > maxSize)
+            rand_height = maxSize;
+
+        // Derive the stride from the final width so it is never smaller than the row.
+        int16_t rand_srcStride = (int16_t)(rand_width + rand() % 100); // Randomly generated srcStride
+
+        ref(pixel_buff, rand_srcStride, IPF_C_output_s, rand_width, rand_height);
+        opt(pixel_buff, rand_srcStride, IPF_vec_output_s, rand_width, rand_height);
+
+        if (memcmp(IPF_vec_output_s, IPF_C_output_s, ipf_t_size))
+            return false;
+    }
+
+    return true;
+}
+
 bool IPFilterHarness::check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt)
 {
     int16_t rand_height = (int16_t)rand() % 100;                 // Randomly generated Height
@@ -417,6 +459,15 @@
         }
     }
 
+    if (opt.luma_p2s) // only tested when an optimized luma_p2s is installed
+    {
+        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s))
+        {
+            printf("luma_p2s failed\n");
+            return false;
+        }
+    }
+
     if (opt.ipfilter_s2p)
     {
         if (!check_IPFilter_primitive(ref.ipfilter_s2p, opt.ipfilter_s2p))
diff -r 77db80a67f4e -r 6bfafdf72eae source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h	Wed Oct 30 15:16:59 2013 +0530
+++ b/source/test/ipfilterharness.h	Wed Oct 30 22:47:44 2013 +0800
@@ -45,6 +45,7 @@
     bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt);
     bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
     bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
+    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt);
     bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
     bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
     bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);



More information about the x265-devel mailing list