[x265] [PATCH Review only] asm & testbench: psyCost_pp_4x4 in sse4: improve 2088c->337c

Divya Manivannan divya at multicorewareinc.com
Mon Dec 29 09:19:58 CET 2014


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1419841142 -19800
#      Mon Dec 29 13:49:02 2014 +0530
# Node ID b2960999295f668030756deb53ce08a50e7af7ca
# Parent  1bf769c6953d7c4f660d26a8618083ac1c0885e5
asm & testbench: psyCost_pp_4x4 in sse4: improve 2088c->337c

diff -r 1bf769c6953d -r b2960999295f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 24 12:31:27 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Dec 29 13:49:02 2014 +0530
@@ -1434,6 +1434,8 @@
 
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
+
+        p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1715,6 +1717,8 @@
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
 //        p.denoiseDct = x265_denoise_dct_sse4;
+
+        p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 1bf769c6953d -r b2960999295f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Dec 24 12:31:27 2014 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Dec 29 13:49:02 2014 +0530
@@ -41,6 +41,8 @@
 hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
 mask_10:   times 4 dw 0, -1
 mask_1100: times 2 dd 0, -1
+hmul_8w:   times 4 dw 1        ; words 0-3: all +1, so pmaddwd yields the sums x0+x1, x2+x3
+           times 2 dw 1, -1    ; words 4-7: +1,-1 pairs, so pmaddwd yields the differences x0-x1, x2-x3
 
 ALIGN 32
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
@@ -6579,3 +6581,165 @@
     mov         [r2], r3w
 .end:
     RET
+
+%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp -- dstN = abs(srcN) for each packed signed dword
+%if cpuflag(ssse3)
+    pabsd   %1, %3             ; direct packed-dword absolute value
+    pabsd   %2, %4
+%elifidn %1, %3
+    pxor    %5, %5             ; in-place case (only dst1/src1 are compared): tmp = 0
+    pxor    %6, %6
+    psubd   %5, %1             ; tmp = 0 - x
+    psubd   %6, %2
+    pmaxsd  %1, %5             ; abs(x) = max(x, -x)
+    pmaxsd  %2, %6             ; NOTE(review): pmaxsd is SSE4.1, so this fallback looks unreachable without SSSE3 -- confirm
+%else
+    pxor    %1, %1             ; out-of-place case: dst = 0
+    pxor    %2, %2
+    psubd   %1, %3             ; dst = 0 - src
+    psubd   %2, %4
+    pmaxsd  %1, %3             ; dst = max(-src, src)
+    pmaxsd  %2, %4
+%endif
+%endmacro
+
+;---------------------------------------------------------------------------------------------------------------------
+;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+; returns abs(E(source) - E(recon)); E(blk) = Hadamard-based cost of the 4x4 block (presumably SATD) - (sum of its pixels >> 2)
+INIT_XMM sse4
+cglobal psyCost_pp_4x4, 4, 5, 8 ; r0 = source, r1 = sstride, r2 = recon, r3 = rstride, r4 = scratch (3 * stride)
+
+%if HIGH_BIT_DEPTH
+    FIX_STRIDES r1, r3              ; macro defined elsewhere; presumably scales pixel strides to byte strides -- confirm
+    lea             r4, [3 * r1]    ; r4 = 3 * sstride, offset of source row 3
+    movddup         m0, [r0]        ; each source row (4 x 16-bit pixels) duplicated into both qwords
+    movddup         m1, [r0 + r1]
+    movddup         m2, [r0 + r1 * 2]
+    movddup         m3, [r0 + r4]
+    mova            m4, [hmul_8w]
+    pmaddwd         m0, m4          ; per row, dwords = {x0+x1, x2+x3, x0-x1, x2-x3}
+    pmaddwd         m1, m4
+    pmaddwd         m2, m4
+    pmaddwd         m3, m4
+
+    paddd           m5, m0, m1      ; accumulate the four rows
+    paddd           m5, m2
+    paddd           m5, m3
+    psrldq          m4, m5, 4
+    paddd           m5, m4          ; low dword = sum of all 16 source pixels
+    psrld           m5, 2           ; low dword = pixel sum >> 2
+
+    SUMSUB_BA d, 0, 1, 4            ; sum/difference butterflies across rows (macro defined elsewhere)
+    SUMSUB_BA d, 2, 3, 4
+    SUMSUB_BA d, 0, 2, 4
+    SUMSUB_BA d, 1, 3, 4
+    %define ORDER unord
+    TRANS q, ORDER, 0, 2, 4, 6      ; qword rearrangement via TRANS (defined elsewhere), m4/m6 as temporaries
+    TRANS q, ORDER, 1, 3, 4, 6
+    ABSD2 m0, m2, m0, m2, m4, m6    ; in-place absolute values
+    pmaxsd          m0, m2          ; keep the larger magnitude of each pair
+    ABSD2 m1, m3, m1, m3, m4, m6
+    pmaxsd          m1, m3
+    paddd           m0, m1
+    movhlps         m1, m0          ; horizontal add: fold high qword onto low qword ...
+    paddd           m0, m1
+    psrldq          m1, m0, 4       ; ... then dword 1 onto dword 0
+    paddd           m0, m1
+
+    psubd           m7, m0, m5      ; m7 (low dword) = source energy = Hadamard term - (pixel sum >> 2)
+
+    lea             r4, [3 * r3]    ; r4 = 3 * rstride; same sequence repeated for the recon block
+    movddup         m0, [r2]
+    movddup         m1, [r2 + r3]
+    movddup         m2, [r2 + r3 * 2]
+    movddup         m3, [r2 + r4]
+    mova            m4, [hmul_8w]
+    pmaddwd         m0, m4
+    pmaddwd         m1, m4
+    pmaddwd         m2, m4
+    pmaddwd         m3, m4
+
+    paddd           m5, m0, m1
+    paddd           m5, m2
+    paddd           m5, m3
+    psrldq          m4, m5, 4
+    paddd           m5, m4          ; low dword = sum of all 16 recon pixels
+    psrld           m5, 2
+
+    SUMSUB_BA d, 0, 1, 4
+    SUMSUB_BA d, 2, 3, 4
+    SUMSUB_BA d, 0, 2, 4
+    SUMSUB_BA d, 1, 3, 4
+    %define ORDER unord
+    TRANS q, ORDER, 0, 2, 4, 6
+    TRANS q, ORDER, 1, 3, 4, 6
+    ABSD2 m0, m2, m0, m2, m4, m6
+    pmaxsd          m0, m2
+    ABSD2 m1, m3, m1, m3, m4, m6
+    pmaxsd          m1, m3
+    paddd           m0, m1
+    movhlps         m1, m0
+    paddd           m0, m1
+    psrldq          m1, m0, 4
+    paddd           m0, m1
+
+    psubd           m0, m5          ; m0 (low dword) = recon energy
+
+    psubd           m7, m0          ; source energy - recon energy
+    pabsd           m0, m7
+    movd            eax, m0         ; return the absolute difference held in the low dword
+
+%else ; !HIGH_BIT_DEPTH
+    lea             r4, [3 * r1]    ; r4 = 3 * sstride, offset of source row 3
+    movd            m0, [r0]        ; one row = 4 x 8-bit pixels = one dword
+    movd            m1, [r0 + r1]
+    movd            m2, [r0 + r1 * 2]
+    movd            m3, [r0 + r4]
+    shufps          m0, m1, 0       ; m0 dwords = {row0, row0, row1, row1}
+    shufps          m2, m3, 0       ; m2 dwords = {row2, row2, row3, row3}
+    mova            m4, [hmul_4p]
+    pmaddubsw       m0, m4          ; per row, words = {x0+x1, x2+x3, x0-x1, x2-x3}
+    pmaddubsw       m2, m4
+
+    paddw           m5, m0, m2
+    movhlps         m4, m5
+    paddw           m5, m4          ; low qword = the four rows accumulated
+    pmaddwd         m5, [pw_1]      ; pw_1 is defined elsewhere (presumably all-ones words): low dword = sum of all 16 pixels
+    psrld           m5, 2           ; low dword = pixel sum >> 2
+
+    HADAMARD 0, sumsub, 0, 2, 1, 3  ; Hadamard stages via macros defined elsewhere
+    HADAMARD 4, sumsub, 0, 2, 1, 3
+    HADAMARD 1, amax, 0, 2, 1, 3
+    HADDW m0, m2                    ; presumably a horizontal word add into the low dword of m0 -- confirm against the macro
+
+    psubd           m6, m0, m5      ; m6 = source energy = Hadamard term - (pixel sum >> 2)
+
+    lea             r4, [3 * r3]    ; r4 = 3 * rstride; same sequence repeated for the recon block
+    movd            m0, [r2]
+    movd            m1, [r2 + r3]
+    movd            m2, [r2 + r3 * 2]
+    movd            m3, [r2 + r4]
+    shufps          m0, m1, 0
+    shufps          m2, m3, 0
+    mova            m4, [hmul_4p]
+    pmaddubsw       m0, m4
+    pmaddubsw       m2, m4
+
+    paddw           m5, m0, m2
+    movhlps         m4, m5
+    paddw           m5, m4
+    pmaddwd         m5, [pw_1]
+    psrld           m5, 2
+
+    HADAMARD 0, sumsub, 0, 2, 1, 3
+    HADAMARD 4, sumsub, 0, 2, 1, 3
+    HADAMARD 1, amax, 0, 2, 1, 3
+    HADDW m0, m2
+
+    psubd           m0, m5          ; m0 = recon energy
+
+    psubd           m6, m0          ; source energy - recon energy
+    pabsd           m0, m6
+    movd            eax, m0         ; return the absolute difference held in the low dword
+%endif ; HIGH_BIT_DEPTH
+    RET
diff -r 1bf769c6953d -r b2960999295f source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Dec 24 12:31:27 2014 +0530
+++ b/source/common/x86/pixel.h	Mon Dec 29 13:49:02 2014 +0530
@@ -218,6 +218,7 @@
 
 void x265_downShift_16_sse2(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void x265_upShift_8_sse4(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+int x265_psyCost_pp_4x4_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); // SSE4 assembly psyCost_pp for 4x4 blocks (pixel-a.asm)
 
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
diff -r 1bf769c6953d -r b2960999295f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Dec 24 12:31:27 2014 +0530
+++ b/source/test/pixelharness.cpp	Mon Dec 29 13:49:02 2014 +0530
@@ -948,6 +948,28 @@
     return true;
 }
 
+bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
+{
+    // Feed ITERS randomly chosen source/recon buffer pairs to both kernels;
+    // the recon pointer is shifted by a further INCR after every pass.
+    const intptr_t pitch = STRIDE;
+    int reconOffset = 0;
+    for (int pass = 0; pass < ITERS; pass++, reconOffset += INCR)
+    {
+        const int srcIdx = rand() % TEST_CASES; // source index is drawn before the recon index
+        const int recIdx = rand() % TEST_CASES;
+        const int optCost = (int)checked(opt, pixel_test_buff[srcIdx], pitch, pixel_test_buff[recIdx] + reconOffset, pitch);
+        const int refCost = ref(pixel_test_buff[srcIdx], pitch, pixel_test_buff[recIdx] + reconOffset, pitch);
+
+        if (optCost != refCost)
+            return false;
+        reportfail();
+    }
+
+    // no pass produced a mismatch between the two kernels
+    return true;
+}
+
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.satd[part])
@@ -1290,6 +1312,15 @@
                 return false;
             }
         }
+
+        if (opt.psy_cost_pp[i])
+        {
+            if (!check_psyCost_pp(ref.psy_cost_pp[i], opt.psy_cost_pp[i]))
+            {
+                printf("\npsy_cost_pp[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
     }
 
     if (opt.weight_pp)
@@ -1631,6 +1662,12 @@
             HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
             REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
         }
+
+        if (opt.psy_cost_pp[i])
+        {
+            HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.psy_cost_pp[i], ref.psy_cost_pp[i], pbuf1, STRIDE, pbuf2, STRIDE);
+        }
     }
 
     if (opt.weight_pp)
diff -r 1bf769c6953d -r b2960999295f source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Dec 24 12:31:27 2014 +0530
+++ b/source/test/pixelharness.h	Mon Dec 29 13:49:02 2014 +0530
@@ -92,6 +92,7 @@
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
+    bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
 
 public:
 


More information about the x265-devel mailing list