[x265] [PATCH 1 of 4] asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c over C code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Jun 26 15:22:49 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435323067 -19800
#      Fri Jun 26 18:21:07 2015 +0530
# Node ID 44b574b61b29a3cfba99e8f0d06622e44a86df17
# Parent  d64227e54233d1646c55bcb4b0b831e5340009ed
asm: intra_filter4x4 sse4 code and added testbench support, improved 357c->141c over C code

diff -r d64227e54233 -r 44b574b61b29 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 26 18:21:07 2015 +0530
@@ -2453,6 +2453,8 @@
         p.weight_pp = PFX(weight_pp_sse4);
         p.weight_sp = PFX(weight_sp_sse4);
 
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         ALL_LUMA_TU(intra_pred_allangs, all_angs_pred, sse4);
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/intrapred.h	Fri Jun 26 18:21:07 2015 +0530
@@ -66,6 +66,7 @@
 
 #define DECL_ALL(cpu) \
     FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
+    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
     DECL_ANGS(4, cpu); \
     DECL_ANGS(8, cpu); \
     DECL_ANGS(16, cpu); \
diff -r d64227e54233 -r 44b574b61b29 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Jun 26 18:21:07 2015 +0530
@@ -30,6 +30,9 @@
 intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+intra_filter4_shuf0:  db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 ; pshufb mask, word-wise: out words 0..7 <- src words 1,0,1,2,3,4,5,6
+intra_filter4_shuf1:  db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13 ; pshufb mask, word-wise: out words 0..7 <- src words 7,0,1,2,3,4,5,6
+
 pb_0_8        times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
 pb_swap8:     times 2 db  7,  6,  5,  4,  3,  2,  1,  0
@@ -18276,3 +18279,44 @@
 
     INTRA_PRED_STORE_4x4
     RET
+
+;-----------------------------------------------------------------------------------
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5                     ; r0 = input, r1 = output; r2/r3 and m0-m4 are scratch
+    mov             r2b, byte [r0 +  8]             ; topLast: keep input byte 8, stored unfiltered at the end
+    mov             r3b, byte [r0 + 16]             ; LeftLast: keep input byte 16, stored unfiltered at the end
+    ; each lane computes (2 * samples[i] + samples[i - 1] + samples[i + 1] + [pw_2]) >> 2 on 16-bit words
+    ; filtering top (input bytes 0..7)
+    pmovzxbw        m0, [r0 +  0]                   ; m0 = input bytes 0..7 zero-extended to words
+    pmovzxbw        m1, [r0 +  8]                   ; m1 = input bytes 8..15 zero-extended to words
+    pmovzxbw        m2, [r0 + 16]                   ; m2 = input bytes 16..23; NOTE(review): 8-byte load, confirm the buffer is that large
+    ; lane 0 (element 0) takes elements 1 and 9 as its two neighbours
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m3, m1, m0, 4                   ; elements 2..9 (m1:m0 shifted down by two words)
+    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]
+
+    psllw           m0, 1                           ; 2 * samples[i]
+    paddw           m4, m3                          ; samples[i - 1] + samples[i + 1]
+    paddw           m0, m4
+    paddw           m0, [pw_2]                      ; pw_2 is defined elsewhere; presumably the rounding term 2 per word
+    psrlw           m0, 2
+    ; lane 0 of this half (element 8) is overwritten by topLast after the store
+    ; filtering left (input bytes 8..15)
+    palignr         m4, m1, m1, 14                  ; [14 13 12 11 10 9 8 15] samples[i - 1]
+    pinsrb          m4, [r0], 2                     ; [14 13 12 11 10 9 0 15] samples[i - 1], element 9 now paired with input byte 0
+    palignr         m3, m2, m1, 4                   ; elements 10..17 (m2:m1 shifted down by two words)
+    pshufb          m3, [intra_filter4_shuf1]       ; [16 15 14 13 12 11 10 17] samples[i + 1]
+
+    psllw           m1, 1                           ; 2 * samples[i]
+    paddw           m4, m3                          ; samples[i - 1] + samples[i + 1]
+    paddw           m1, m4
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+    packuswb        m0, m1                          ; narrow both halves back to 16 bytes (elements 0..15)
+
+    movu            [r1], m0                        ; unaligned store of output bytes 0..15
+    mov             [r1 +  8], r2b                  ; topLast: replace output byte 8 with the saved input byte
+    mov             [r1 + 16], r3b                  ; LeftLast: write the saved input byte as output byte 16
+    RET
diff -r d64227e54233 -r 44b574b61b29 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/test/intrapredharness.cpp	Fri Jun 26 18:21:07 2015 +0530
@@ -31,6 +31,16 @@
 {
     for (int i = 0; i < INPUT_SIZE; i++)
         pixel_buff[i] = rand() % PIXEL_MAX;
+
+    /* [0] --- Random values
+     * [1] --- Minimum
+     * [2] --- Maximum */
+    for (int i = 0; i < BUFFSIZE; i++)
+    {
+        pixel_test_buff[0][i]   = rand() % PIXEL_MAX;
+        pixel_test_buff[1][i]   = PIXEL_MIN;
+        pixel_test_buff[2][i]   = PIXEL_MAX;
+    }
 }
 
 bool IntraPredHarness::check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width)
@@ -177,6 +187,27 @@
     return true;
 }
 
+bool IntraPredHarness::check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt)
+{ // Returns true when opt and ref produce identical output buffers in all 100 trials, false on the first mismatch.
+    memset(pixel_out_c, 0, 64 * 64 * sizeof(pixel));
+    memset(pixel_out_vec, 0, 64 * 64 * sizeof(pixel));
+    int j = 0; // offset into the selected input buffer; grows by FENC_STRIDE after every trial
+    // The output buffers are zeroed once above and are not cleared between trials.
+    for (int i = 0; i < 100; i++)
+    {
+        int index = rand() % TEST_CASES; // choose one of the TEST_CASES rows of pixel_test_buff at random
+        // Both implementations read the same input pointer and write to separate output buffers.
+        ref(pixel_test_buff[index] + j, pixel_out_c);
+        checked(opt, pixel_test_buff[index] + j, pixel_out_vec);
+        // Compare the full 64 * 64 pixels of both output buffers.
+        if (memcmp(pixel_out_c, pixel_out_vec, 64 * 64 * sizeof(pixel)))
+            return false;
+        // NOTE(review): checked()/reportfail() are macros defined elsewhere; presumably they validate the opt call itself - confirm.
+        reportfail();
+        j += FENC_STRIDE;
+    }
+    return true;
+}
 bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (int i = BLOCK_4x4; i <= BLOCK_32x32; i++)
@@ -213,6 +244,14 @@
                 return false;
             }
         }
+        if (opt.cu[i].intra_filter)
+        {
+            if (!check_intra_filter_primitive(ref.cu[i].intra_filter, opt.cu[i].intra_filter))
+            {
+                printf("intra_filter_%dx%d failed\n", size, size);
+                return false;
+            }
+        }
     }
 
     return true;
@@ -268,5 +307,10 @@
                                pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, mode, bFilter);
             }
         }
+        if (opt.cu[i].intra_filter)
+        {
+            printf("intra_filter_%dx%d", size, size);
+            REPORT_SPEEDUP(opt.cu[i].intra_filter, ref.cu[i].intra_filter, pixel_buff, pixel_out_c);
+        }
     }
 }
diff -r d64227e54233 -r 44b574b61b29 source/test/intrapredharness.h
--- a/source/test/intrapredharness.h	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/test/intrapredharness.h	Fri Jun 26 18:21:07 2015 +0530
@@ -34,7 +34,15 @@
     enum { INPUT_SIZE = 4 * 65 * 65 * 100 };
     enum { OUTPUT_SIZE = 64 * FENC_STRIDE };
     enum { OUTPUT_SIZE_33 = 33 * OUTPUT_SIZE };
+    enum { TEST_CASES = 3 };
+    enum { INCR = 32 };
+    enum { STRIDE = 64 };
+    enum { ITERS = 100 };
+    enum { MAX_HEIGHT = 64 };
+    enum { PAD_ROWS = 64 };
+    enum { BUFFSIZE = STRIDE * (MAX_HEIGHT + PAD_ROWS) + INCR * ITERS };
 
+    pixel    pixel_test_buff[TEST_CASES][BUFFSIZE];
     ALIGN_VAR_16(pixel, pixel_buff[INPUT_SIZE]);
     pixel pixel_out_c[OUTPUT_SIZE];
     pixel pixel_out_vec[OUTPUT_SIZE];
@@ -45,6 +53,7 @@
     bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
     bool check_angular_primitive(const intra_pred_t ref[], const intra_pred_t opt[], int size);
     bool check_allangs_primitive(const intra_allangs_t ref, const intra_allangs_t opt, int size);
+    bool check_intra_filter_primitive(const intra_filter_t ref, const intra_filter_t opt);
 
 public:
 


More information about the x265-devel mailing list