[x265] [PATCH 4 of 4] asm: chroma_p2s to replace ipfilter_p2s

Min Chen chenm003 at 163.com
Thu Oct 31 14:03:02 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1383224503 -28800
# Node ID 4a40c4069ad12bc72a1c443b45a91c65d319d35d
# Parent  21dbf988079b0e33265ae48578c26347cc779fbe
asm: chroma_p2s to replace ipfilter_p2s

diff -r 21dbf988079b -r 4a40c4069ad1 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Thu Oct 31 21:01:43 2013 +0800
@@ -619,10 +619,13 @@
     uint32_t cxWidth = width >> 1;
     uint32_t cxHeight = height >> 1;
 
+    assert(dstStride == MAX_CU_SIZE / 2);
+    assert(((cxWidth | cxHeight) % 2) == 0);
+
     if ((yFrac | xFrac) == 0)
     {
-        primitives.ipfilter_p2s(refCb, refStride, dstCb, dstStride, cxWidth, cxHeight);
-        primitives.ipfilter_p2s(refCr, refStride, dstCr, dstStride, cxWidth, cxHeight);
+        primitives.chroma_p2s(refCb, refStride, dstCb, cxWidth, cxHeight);
+        primitives.chroma_p2s(refCr, refStride, dstCr, cxWidth, cxHeight);
     }
     else if (yFrac == 0)
     {
diff -r 21dbf988079b -r 4a40c4069ad1 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/common/ipfilter.cpp	Thu Oct 31 21:01:43 2013 +0800
@@ -264,6 +264,7 @@
     }
 }
 
+template<int dstStride>
 void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
 {
     int shift = IF_INTERNAL_PREC - X265_DEPTH;
@@ -278,7 +279,7 @@
         }
 
         src += srcStride;
-        dst += MAX_CU_SIZE;
+        dst += dstStride;
     }
 }
 
@@ -489,7 +490,8 @@
 
     p.ipfilter_p2s = filterConvertPelToShort_c;
     p.ipfilter_s2p = filterConvertShortToPel_c;
-    p.luma_p2s = filterConvertPelToShort_c;
+    p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
+    p.chroma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE/2>;
 
     p.extendRowBorder = extendCURowColBorder;
 }
diff -r 21dbf988079b -r 4a40c4069ad1 source/common/primitives.h
--- a/source/common/primitives.h	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/common/primitives.h	Thu Oct 31 21:01:43 2013 +0800
@@ -254,6 +254,7 @@
     filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
     filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
     filter_p2s_t    luma_p2s;
+    filter_p2s_t    chroma_p2s;
 
     intra_dc_t      intra_pred_dc;
     intra_planar_t  intra_pred_planar;
diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 31 21:01:43 2013 +0800
@@ -318,6 +318,7 @@
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
         p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
         p.luma_p2s = x265_luma_p2s_ssse3;
+        p.chroma_p2s = x265_chroma_p2s_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/common/x86/ipfilter8.asm	Thu Oct 31 21:01:43 2013 +0800
@@ -2124,3 +2124,76 @@
     jnz         .loopH
 
     RET
+
+
+;-----------------------------------------------------------------------------
+; void chroma_p2s(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;
+; Widens a block of 8-bit pixels into int16_t values, two source rows per pass.
+;   r0 = src, r1 = srcStride, r2 = dst, r3d = width, r4d = height
+; The dst row pitch is fixed at FENC_STRIDE / 2 int16_t elements.
+; Requirements: width is a non-zero multiple of 2 and height a non-zero
+; multiple of 2 (the row counter steps by 2 and is tested with jnz).
+; NOTE: every load fetches 8 source bytes, even when fewer columns remain.
+;-----------------------------------------------------------------------------
+; TODO: combining U and V would perform better, but needs more registers
+; TODO: separate paths for heights aligned to 4 and otherwise may improve performance by 10%, but the code is more complex, so it is disabled
+INIT_XMM ssse3
+cglobal chroma_p2s, 3, 7, 6
+
+    ; load width and height
+    mov         r3d, r3m
+    mov         r4d, r4m
+
+    ; load constants (kept in m4/m5 for the whole function)
+    mova        m4, [tab_c_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+
+    xor         r5d, r5d                ; r5 = column offset within the row pair
+.loopW:
+    lea         r6, [r0 + r5]
+
+    ; row 0: interleave pixels with the m4 bytes, then multiply-accumulate with m5
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    ; row 1: must use the same constants as row 0 (m4/m5)
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    add         r5d, 8
+    cmp         r5d, r3d
+    lea         r6, [r2 + r5 * 2]       ; r6 - 16 = dst address of this 8-column group
+    jg          .width4                 ; fewer than 8 columns left: partial stores
+    movu        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+    movu        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+    je          .nextH                  ; flags still from cmp: row ended exactly
+    jmp         .loopW
+
+.width4:
+    test        r3d, 4
+    jz          .width2                 ; no group of 4 in the tail
+    test        r3d, 2                  ; flags consumed by the jz below
+    movh        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+    movh        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+    lea         r6, [r6 + 8]
+    pshufd      m0, m0, 2               ; bring columns 4-5 down to the low dword
+    pshufd      m1, m1, 2
+    jz          .nextH                  ; tail was exactly 4 columns
+
+.width2:
+    movd        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+    movd        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+
+.nextH:
+    lea         r0, [r0 + r1 * 2]
+    add         r2, FENC_STRIDE / 2 * 4
+
+    sub         r4d, 2
+    jnz         .loopH
+
+    RET
diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/common/x86/ipfilter8.h	Thu Oct 31 21:01:43 2013 +0800
@@ -91,6 +91,7 @@
 void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
 void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
 void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
 
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_LUMA_FUNC_DEF
diff -r 21dbf988079b -r 4a40c4069ad1 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/test/ipfilterharness.cpp	Thu Oct 31 21:01:43 2013 +0800
@@ -240,14 +240,15 @@
     return true;
 }
 
-bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt)
+bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma)
 {
-    int16_t rand_srcStride;
+    intptr_t rand_srcStride;
+    const int min_size = isChroma ? 2 : 4;
 
     for (int i = 0; i <= 1000; i++)
     {
-        int16_t rand_height = (int16_t)rand() % 100;                 // Randomly generated Height
-        int16_t rand_width = (int16_t)rand() % 100;                  // Randomly generated Width
+        int rand_height = (int16_t)rand() % 100;                 // Randomly generated Height
+        int rand_width = (int16_t)rand() % 100;                  // Randomly generated Width
 
         memset(IPF_vec_output_s, 0, ipf_t_size);      // Initialize output buffer to zero
         memset(IPF_C_output_s, 0, ipf_t_size);        // Initialize output buffer to zero
@@ -256,13 +257,13 @@
         if (rand_srcStride < rand_width)
             rand_srcStride = rand_width;
 
-        rand_width %= 4;
-        if (rand_width < 4)
-            rand_width = 4;
+        rand_width %= min_size;
+        if (rand_width < min_size)
+            rand_width = min_size;
 
-        rand_height %= 4;
-        if (rand_height < 4)
-            rand_height = 4;
+        rand_height %= min_size;
+        if (rand_height < min_size)
+            rand_height = min_size;
 
         ref(pixel_buff,
             rand_srcStride,
@@ -461,7 +462,16 @@
 
     if (opt.luma_p2s)
     {
-        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s))
+        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s, 0))
+        {
+            printf("ipfilter_p2s failed\n");
+            return false;
+        }
+    }
+
+    if (opt.chroma_p2s)
+    {
+        if (!check_IPFilter_primitive(ref.chroma_p2s, opt.chroma_p2s, 1))
         {
             printf("ipfilter_p2s failed\n");
             return false;
@@ -586,6 +596,13 @@
                        pixel_buff, srcStride, IPF_vec_output_s, width, height);
     }
 
+    if (opt.chroma_p2s)
+    {
+        printf("chroma_p2s\t");
+        REPORT_SPEEDUP(opt.chroma_p2s, ref.chroma_p2s,
+                       pixel_buff, srcStride, IPF_vec_output_s, width, height);
+    }
+
     if (opt.ipfilter_s2p)
     {
         printf("ipfilter_s2p\t");
diff -r 21dbf988079b -r 4a40c4069ad1 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h	Thu Oct 31 21:01:29 2013 +0800
+++ b/source/test/ipfilterharness.h	Thu Oct 31 21:01:43 2013 +0800
@@ -45,7 +45,7 @@
     bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt);
     bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
     bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
-    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt);
+    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma);
     bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
     bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
     bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);



More information about the x265-devel mailing list