[x265] [PATCH 03 of 29] scale1D_128to64_new primitive: ASM code and unit test code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:11 CET 2015


# HG changeset patch
# User Praveen Tiwari
# Date 1421044413 -19800
#      Mon Jan 12 12:03:33 2015 +0530
# Node ID f3fd0d075bba2320ed8cc93df055b144b5a1b88e
# Parent  cf29bf7824491d35e20df5249810ff9a1520d3e3
scale1D_128to64_new primitive: ASM code and unit test code

diff -r cf29bf782449 -r f3fd0d075bba source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 12 20:37:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 12 12:03:33 2015 +0530
@@ -1385,6 +1385,7 @@
     if (cpuMask & X265_CPU_SSSE3)
     {
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+        p.scale1D_128to64_new = x265_scale1D_128to64_new_ssse3;
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
 
         INTRA_ANG_SSSE3(ssse3);
@@ -1446,6 +1447,7 @@
         p.nquant = x265_nquant_avx2;
         p.dequant_normal  = x265_dequant_normal_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        p.scale1D_128to64_new = x265_scale1D_128to64_new_avx2;
         p.cu[BLOCK_4x4].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_4_avx2;
         p.cu[BLOCK_8x8].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_8_avx2;
         p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_avx2;
@@ -1603,6 +1605,7 @@
         INTRA_ANG_SSSE3(ssse3);
 
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+        p.scale1D_128to64_new = x265_scale1D_128to64_new_ssse3;
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
         SAD_X3(ssse3);
         SAD_X4(ssse3);
@@ -1813,6 +1816,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
         p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].copy_ss = x265_blockcopy_ss_16x64_avx;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        p.scale1D_128to64_new = x265_scale1D_128to64_new_avx2;
 
         p.weight_pp = x265_weight_pp_avx2;
 
diff -r cf29bf782449 -r f3fd0d075bba source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Mon Jan 12 20:37:35 2015 +0530
+++ b/source/common/x86/pixel-util.h	Mon Jan 12 12:03:33 2015 +0530
@@ -65,6 +65,8 @@
 
 void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
 void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_new_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_new_avx2(pixel*, const pixel*, intptr_t);
 void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r cf29bf782449 -r f3fd0d075bba source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Jan 12 20:37:35 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Mon Jan 12 12:03:33 2015 +0530
@@ -2892,6 +2892,335 @@
 INIT_XMM avx
 SSIM
 
+%macro SCALE1D_128to64_HBD 0 ; word (16-bit) path: halves r1 bytes [0,256) into r0 bytes [0,128); the pshufb mask must already be in m7
+    movu        m0,      [r1]               ; source bytes [0,64) -> dest bytes [0,32)
+    palignr     m1,      m0,    2           ; m1 word i = m0 word i+1 for i < 7; word 7 comes from m1's old contents in the 2-operand form
+    movu        m2,      [r1 + 16]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 32]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 48]
+    pavgw       m0,      m1                 ; rounded average of each word with its right-hand neighbour
+    palignr     m1,      m6,    2           ; m1 is free for reuse once the m0 average has been taken
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7          ; presumably packs the even-indexed averages into the low qword (mask table not visible here - TODO confirm)
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+    punpcklqdq    m0,           m2          ; low qwords of m0 and m2 form one 16-byte result
+    movu          [r0],         m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 16],    m4
+
+    movu        m0,      [r1 + 64]          ; source bytes [64,128) -> dest bytes [32,64), same sequence as above
+    palignr     m1,      m0,    2
+    movu        m2,      [r1 + 80]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 96]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 112]
+    pavgw       m0,      m1
+    palignr     m1,      m6,    2
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+    punpcklqdq    m0,           m2
+    movu          [r0 + 32],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 48],    m4
+
+    movu        m0,      [r1 + 128]         ; source bytes [128,192) -> dest bytes [64,96)
+    palignr     m1,      m0,    2
+    movu        m2,      [r1 + 144]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 160]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 176]
+    pavgw       m0,      m1
+    palignr     m1,      m6,    2
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 64],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 80],    m4
+
+    movu        m0,      [r1 + 192]         ; source bytes [192,256) -> dest bytes [96,128)
+    palignr     m1,      m0,    2
+    movu        m2,      [r1 + 208]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 224]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 240]
+    pavgw       m0,      m1
+    palignr     m1,      m6,    2
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 96],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 112],    m4
+%endmacro
+
+;-----------------------------------------------------------------
+; void scale1D_128to64_new(pixel *dst, const pixel *src, intptr_t /*stride*/)
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale1D_128to64_new, 2, 2, 8, dest, src1, stride
+%if HIGH_BIT_DEPTH
+    mova        m7,      [deinterleave_word_shuf]  ; pshufb mask consumed by SCALE1D_128to64_HBD
+
+    ;Top pixels: source bytes [0,256) -> dest bytes [0,128)
+    SCALE1D_128to64_HBD
+
+    ;Left pixels: advance both pointers past the top half, then repeat the macro
+    add         r1,      256
+    add         r0,      128
+    SCALE1D_128to64_HBD
+
+%else
+    mova        m7,      [deinterleave_shuf]  ; pshufb mask for the byte path (table not visible here)
+
+    ;Top pixels: source bytes [0,128) -> dest bytes [0,64)
+    movu        m0,      [r1]
+    palignr     m1,      m0,    1             ; m1 byte i = m0 byte i+1 for i < 15; byte 15 comes from m1's old contents in the 2-operand form
+    movu        m2,      [r1 + 16]
+    palignr     m3,      m2,    1
+    movu        m4,      [r1 + 32]
+    palignr     m5,      m4,    1
+    movu        m6,      [r1 + 48]
+
+    pavgb       m0,      m1                   ; rounded average of each byte with its right-hand neighbour
+
+    palignr     m1,      m6,    1             ; m1 is free for reuse once the m0 average has been taken
+
+    pavgb       m2,      m3
+    pavgb       m4,      m5
+    pavgb       m6,      m1
+
+    pshufb      m0,      m0,    m7            ; presumably packs the even-indexed averages into the low qword - TODO confirm against the mask table
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2            ; low qwords of m0 and m2 form one 16-byte result
+    movu          [r0],         m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 16],    m4
+
+    movu        m0,      [r1 + 64]            ; source bytes [64,128) -> dest bytes [32,64)
+    palignr     m1,      m0,    1
+    movu        m2,      [r1 + 80]
+    palignr     m3,      m2,    1
+    movu        m4,      [r1 + 96]
+    palignr     m5,      m4,    1
+    movu        m6,      [r1 + 112]
+
+    pavgb       m0,      m1
+
+    palignr     m1,      m6,    1
+
+    pavgb       m2,      m3
+    pavgb       m4,      m5
+    pavgb       m6,      m1
+
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 32],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 48],    m4
+
+    ;Left pixels: source bytes [128,256) -> dest bytes [64,128)
+    movu        m0,      [r1 + 128]
+    palignr     m1,      m0,    1
+    movu        m2,      [r1 + 144]
+    palignr     m3,      m2,    1
+    movu        m4,      [r1 + 160]
+    palignr     m5,      m4,    1
+    movu        m6,      [r1 + 176]
+
+    pavgb       m0,      m1
+
+    palignr     m1,      m6,    1
+
+    pavgb       m2,      m3
+    pavgb       m4,      m5
+    pavgb       m6,      m1
+
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 64],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 80],    m4
+
+    movu        m0,      [r1 + 192]           ; source bytes [192,256) -> dest bytes [96,128)
+    palignr     m1,      m0,    1
+    movu        m2,      [r1 + 208]
+    palignr     m3,      m2,    1
+    movu        m4,      [r1 + 224]
+    palignr     m5,      m4,    1
+    movu        m6,      [r1 + 240]
+
+    pavgb       m0,      m1
+
+    palignr     m1,      m6,    1
+
+    pavgb       m2,      m3
+    pavgb       m4,      m5
+    pavgb       m6,      m1
+
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 96],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 112],   m4
+%endif
+RET
+
+%if HIGH_BIT_DEPTH == 1
+INIT_YMM avx2
+cglobal scale1D_128to64_new, 2, 2, 3
+    pxor            m2, m2                  ; zero operand: pavgw against it gives (x + 1) >> 1
+
+    ;Top pixels: source bytes [0,256) -> dest bytes [0,128), 64 bytes in / 32 bytes out per group
+    movu            m0, [r1]
+    movu            m1, [r1 + 32]
+    phaddw          m0, m1                  ; sums of adjacent word pairs, interleaved per 128-bit lane
+    pavgw           m0, m2                  ; (sum + 1) >> 1 = rounded average of each pair
+    vpermq          m0, m0, 0xD8            ; qword order 0,2,1,3 puts the per-lane results back in memory order
+    movu            [r0], m0
+
+    movu            m0, [r1 + 64]
+    movu            m1, [r1 + 96]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 32], m0
+
+    movu            m0, [r1 + 128]
+    movu            m1, [r1 + 160]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 64], m0
+
+    movu            m0, [r1 + 192]
+    movu            m1, [r1 + 224]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 96], m0
+
+    ;Left pixels: source bytes [256,512) -> dest bytes [128,256)
+    movu            m0, [r1 + 256]
+    movu            m1, [r1 + 288]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 128], m0
+
+    movu            m0, [r1 + 320]
+    movu            m1, [r1 + 352]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 160], m0
+
+    movu            m0, [r1 + 384]
+    movu            m1, [r1 + 416]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 192], m0
+
+    movu            m0, [r1 + 448]
+    movu            m1, [r1 + 480]
+    phaddw          m0, m1
+    pavgw           m0, m2
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 224], m0
+
+    RET
+%else ; HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+cglobal scale1D_128to64_new, 2, 2, 4
+    pxor            m2, m2                  ; zero operand: pavgw against it gives (x + 1) >> 1
+    mova            m3, [pb_1]              ; presumably a vector of byte 1s (table not visible here - TODO confirm)
+
+    ;Top pixels: source bytes [0,128) -> dest bytes [0,64), 64 bytes in / 32 bytes out per group
+    movu            m0, [r1]
+    pmaddubsw       m0, m0, m3              ; with m3 all ones this is the sum of each adjacent byte pair, as a word
+    pavgw           m0, m2                  ; (sum + 1) >> 1 = rounded average of each pair
+    movu            m1, [r1 + 32]
+    pmaddubsw       m1, m1, m3
+    pavgw           m1, m2
+    packuswb        m0, m1                  ; words back to bytes, interleaved per 128-bit lane
+    vpermq          m0, m0, 0xD8            ; qword order 0,2,1,3 puts the per-lane results back in memory order
+    movu            [r0], m0
+
+    movu            m0, [r1 + 64]
+    pmaddubsw       m0, m0, m3
+    pavgw           m0, m2
+    movu            m1, [r1 + 96]
+    pmaddubsw       m1, m1, m3
+    pavgw           m1, m2
+    packuswb        m0, m1
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 32], m0
+
+    ;Left pixels: source bytes [128,256) -> dest bytes [64,128)
+    movu            m0, [r1 + 128]
+    pmaddubsw       m0, m0, m3
+    pavgw           m0, m2
+    movu            m1, [r1 + 160]
+    pmaddubsw       m1, m1, m3
+    pavgw           m1, m2
+    packuswb        m0, m1
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 64], m0
+
+    movu            m0, [r1 + 192]
+    pmaddubsw       m0, m0, m3
+    pavgw           m0, m2
+    movu            m1, [r1 + 224]
+    pmaddubsw       m1, m1, m3
+    pavgw           m1, m2
+    packuswb        m0, m1
+    vpermq          m0, m0, 0xD8
+    movu            [r0 + 96], m0
+    RET
+%endif
+
 ;-----------------------------------------------------------------
 ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
 ;-----------------------------------------------------------------
diff -r cf29bf782449 -r f3fd0d075bba source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Jan 12 20:37:35 2015 +0530
+++ b/source/test/pixelharness.cpp	Mon Jan 12 12:03:33 2015 +0530
@@ -708,6 +708,33 @@
     return true;
 }
 
+bool PixelHarness::check_scale_pp_new(scale_t ref, scale_t opt)
+{
+    // Runs the reference and optimized primitives on identical inputs and
+    // compares their zero-initialized output buffers in full after each call.
+    ALIGN_VAR_16(pixel, refOut[64 * 64]);
+    ALIGN_VAR_16(pixel, optOut[64 * 64]);
+    memset(refOut, 0, sizeof(refOut));
+    memset(optOut, 0, sizeof(optOut));
+    intptr_t stride = STRIDE;
+    int srcOffset = 0;
+    for (int iter = 0; iter < ITERS; iter++, srcOffset += INCR)
+    {
+        int bufIdx = iter % TEST_CASES;
+        checked(opt, optOut, pixel_test_buff[bufIdx] + srcOffset, stride);
+        ref(refOut, pixel_test_buff[bufIdx] + srcOffset, stride);
+
+        if (memcmp(refOut, optOut, sizeof(refOut)) != 0)
+            return false;
+
+        reportfail();
+    }
+
+    // Every iteration produced matching output.
+    return true;
+}
+
+
 bool PixelHarness::check_transpose(transpose_t ref, transpose_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1539,6 +1566,15 @@
         }
     }
 
+    if (opt.scale1D_128to64_new)
+    {
+        if (!check_scale_pp_new(ref.scale1D_128to64_new, opt.scale1D_128to64_new))
+        {
+            printf("scale1D_128to64_new failed!\n");
+            return false;
+        }
+    }
+
     if (opt.scale2D_64to32)
     {
         if (!check_scale_pp(ref.scale2D_64to32, opt.scale2D_64to32))
@@ -1925,6 +1961,12 @@
         REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1, 64);
     }
 
+    if (opt.scale1D_128to64_new)
+    {
+        HEADER0("scale1D_128to64_new");
+        REPORT_SPEEDUP(opt.scale1D_128to64_new, ref.scale1D_128to64_new, pbuf2, pbuf1, 64);
+    }
+
     if (opt.scale2D_64to32)
     {
         HEADER0("scale2D_64to32");
diff -r cf29bf782449 -r f3fd0d075bba source/test/pixelharness.h
--- a/source/test/pixelharness.h	Mon Jan 12 20:37:35 2015 +0530
+++ b/source/test/pixelharness.h	Mon Jan 12 12:03:33 2015 +0530
@@ -76,6 +76,7 @@
     bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
     bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
     bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
+    bool check_scale_pp_new(scale_t ref, scale_t opt);
     bool check_scale_pp(scale_t ref, scale_t opt);
     bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
     bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);


More information about the x265-devel mailing list