[x265] [PATCH] asm: ssse3 and avx2 for cutree fixed point conversion

Divya Manivannan divya at multicorewareinc.com
Thu Jun 9 12:53:45 CEST 2016


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1465467522 -19800
#      Thu Jun 09 15:48:42 2016 +0530
# Node ID 00860e5a58eb6cf495b9e2c1d9cf6f3c0cc042e8
# Parent  0af296185f7ae3e05493ecf164046ddfec085bb3
asm: ssse3 and avx2 for cutree fixed point conversion

ssse3:
cuTreeFix8Pack    1.99x    3438.00         6841.12
cuTreeFix8Unpack  1.77x    4268.45         7572.87
avx2:
cuTreeFix8Pack    3.44x    2005.33         6905.06
cuTreeFix8Unpack  2.61x    2926.97         7631.64

diff -r 0af296185f7a -r 00860e5a58eb source/common/pixel.cpp
--- a/source/common/pixel.cpp	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/pixel.cpp	Thu Jun 09 15:48:42 2016 +0530
@@ -872,6 +872,22 @@
     }
 }
 
+/* Conversion between double and signed Q8.8 fixed point (stored in native byte order) */
+static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
+{
+    for (int i = 0; i < count; i++) /* cast via int16_t: negative double -> uint16_t is undefined */
+        dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);
+}
+
+static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
+{
+    /* Inverse of the Q8.8 packing: every stored word is reinterpreted as a
+     * signed 16-bit integer and divided by 256 to recover the double value.
+     * Nothing is written when count is zero or negative. */
+    for (int remaining = count; remaining > 0; remaining--)
+        *dst++ = (double)(int16_t)(*src++) / 256.0;
+}
+
 #if HIGH_BIT_DEPTH
 static void calcHDRStats_c(pixel *srcY, pixel* srcU, pixel* srcV, intptr_t stride, intptr_t strideC, int width, int height, double *outsum, 
                            pixel *outMax, const pixel minPix, const pixel maxPix, const int hShift, const int vShift)
@@ -1225,5 +1241,7 @@
     p.calcHDRStats = calcHDRStats_c;
 #endif
     p.propagateCost = estimateCUPropagateCost;
+    p.fix8Unpack = cuTreeFix8Unpack;
+    p.fix8Pack = cuTreeFix8Pack;
 }
 }
diff -r 0af296185f7a -r 00860e5a58eb source/common/primitives.h
--- a/source/common/primitives.h	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/primitives.h	Thu Jun 09 15:48:42 2016 +0530
@@ -189,6 +189,9 @@
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
+typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
+typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
+
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
@@ -314,6 +317,8 @@
 
     downscale_t           frameInitLowres;
     cutree_propagate_cost propagateCost;
+    cutree_fix8_unpack    fix8Unpack;
+    cutree_fix8_pack      fix8Pack;
 
     extendCURowBorder_t   extendRowBorder;
     planecopy_cp_t        planecopy_cp;
diff -r 0af296185f7a -r 00860e5a58eb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 09 15:48:42 2016 +0530
@@ -1098,6 +1098,8 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -2152,6 +2154,8 @@
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
 
         // TODO: depends on hps and vsp
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
@@ -2452,6 +2456,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -3683,6 +3689,8 @@
         p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
         p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
         p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
 
     }
 #endif
diff -r 0af296185f7a -r 00860e5a58eb source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/x86/mc-a2.asm	Thu Jun 09 15:48:42 2016 +0530
@@ -44,6 +44,10 @@
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif
 
+cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7
+                         db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15
+
+const pq_256,       times 4 dq 256.0
 const pd_inv256,    times 4 dq 0.00390625
 const pd_0_5,       times 4 dq 0.5
 
@@ -1213,3 +1217,121 @@
 
 INIT_YMM avx2
 MBTREE_AVX
+
+
+%macro CUTREE_FIX8 0
+;-----------------------------------------------------------------------------
+; void cutree_fix8_pack( uint16_t *dst, double *src, int count )
+;-----------------------------------------------------------------------------
+cglobal cutree_fix8_pack, 3, 4, 5 ; r0 = dst, r1 = src, r2 = count: dst[i] = src[i] * 256 truncated to a 16-bit word
+    movapd       m2, [pq_256] ; m2 = 256.0 in every lane
+    sub          r2d, mmsize / 2 ; one vector iteration consumes mmsize/2 values (4 loads of mmsize bytes)
+    movsxdifnidn r2, r2d
+    lea          r1, [r1 + 8 * r2] ; r1 = &src[count - mmsize/2]
+    lea          r0, [r0 + 2 * r2] ; r0 = &dst[count - mmsize/2]
+    neg          r2 ; r2 = mmsize/2 - count, used as a rising index relative to r0/r1
+    jg .skip_loop ; fewer than mmsize/2 values: scalar path only
+.loop:
+    mulpd        m0, m2, [r1 + 8 * r2] ; scale four vectors of doubles by 256
+    mulpd        m1, m2, [r1 + 8 * r2 + mmsize]
+    mulpd        m3, m2, [r1 + 8 * r2 + 2 * mmsize]
+    mulpd        m4, m2, [r1 + 8 * r2 + 3 * mmsize]
+    cvttpd2dq    xm0, m0 ; truncating double -> int32 conversion
+    cvttpd2dq    xm1, m1
+    cvttpd2dq    xm3, m3
+    cvttpd2dq    xm4, m4
+%if mmsize == 32
+    vinserti128  m0, m0, xm3, 1 ; m0 = [xm0 | xm3], m1 = [xm1 | xm4] so the per-lane pack keeps values in order
+    vinserti128  m1, m1, xm4, 1
+    packssdw     m0, m1 ; int32 -> int16 with signed saturation
+%else
+    punpcklqdq   m0, m1 ; each conversion filled only the low qword; merge pairs before packing
+    punpcklqdq   m3, m4
+    packssdw     m0, m3 ; int32 -> int16 with signed saturation
+%endif
+    mova         [r0 + 2 * r2], m0 ; store mmsize/2 packed words
+    add          r2, mmsize / 2
+    jle .loop ; r2 <= 0 means at least one more full vector of values remains
+.skip_loop:
+    sub          r2, mmsize / 2 ; r2 = -(number of leftover values)
+    jz .end
+    ; Do the remaining values in scalar in order to avoid overreading src.
+.scalar:
+    movq         xm0, [r1 + 8 * r2 + 4 * mmsize] ; r1 + 4 * mmsize == &src[count]
+    mulsd        xm0, xm2
+    cvttsd2si    r3d, xm0
+    mov          [r0 + 2 * r2 + mmsize], r3w ; r0 + mmsize == &dst[count]; keeps the low word, no saturation here
+    inc          r2
+    jl .scalar
+.end:
+    RET
+
+;-----------------------------------------------------------------------------
+; void cutree_fix8_unpack( double *dst, uint16_t *src, int count )
+;-----------------------------------------------------------------------------
+cglobal cutree_fix8_unpack, 3, 4, 7 ; r0 = dst, r1 = src, r2 = count: dst[i] = (int16_t)src[i] / 256.0
+%if mmsize != 32
+    mova           m4, [cutree_fix8_unpack_shuf+16] ; second half of the mask (words 4..7) for the 16-byte path
+%endif
+    movapd         m2, [pd_inv256] ; m2 = 1/256 in every lane
+    mova           m3, [cutree_fix8_unpack_shuf] ; moves each source word into the upper half of a zeroed dword
+    sub            r2d, mmsize / 2 ; one vector iteration consumes mmsize/2 values
+    movsxdifnidn   r2, r2d
+    lea            r1, [r1 + 2 * r2] ; r1 = &src[count - mmsize/2]
+    lea            r0, [r0 + 8 * r2] ; r0 = &dst[count - mmsize/2]
+    neg            r2 ; r2 = mmsize/2 - count, used as a rising index relative to r0/r1
+    jg .skip_loop ; fewer than mmsize/2 values: scalar path only
+.loop:
+%if mmsize == 32
+    vbroadcasti128 m0, [r1 + 2 * r2] ; 8 words copied to both lanes; the mask picks words 0..3 / 4..7 per lane
+    vbroadcasti128 m1, [r1 + 2 * r2 + 16]
+    pshufb         m0, m3
+    pshufb         m1, m3
+%else
+    mova           m1, [r1 + 2 * r2] ; 8 source words
+    pshufb         m0, m1, m3 ; words 0..3
+    pshufb         m1, m4 ; words 4..7
+%endif
+    psrad          m0, 16 ; sign-extend
+    psrad          m1, 16
+    cvtdq2pd       m5, xm0 ; convert the low half of the dwords to doubles
+    cvtdq2pd       m6, xm1
+%if mmsize == 32
+    vpermq         m0, m0, q1032 ; swap 128-bit lanes to reach the upper half
+    vpermq         m1, m1, q1032
+%else
+    psrldq         m0, 8 ; shift the upper two dwords down
+    psrldq         m1, 8
+%endif
+    cvtdq2pd       m0, xm0 ; convert the remaining half
+    cvtdq2pd       m1, xm1
+    mulpd          m0, m2 ; scale by 1/256
+    mulpd          m1, m2
+    mulpd          m5, m2
+    mulpd          m6, m2
+    movapd         [r0 + 8 * r2], m5 ; stores in source order: m5, m0, m6, m1
+    movapd         [r0 + 8 * r2 + mmsize], m0
+    movapd         [r0 + 8 * r2 + mmsize * 2], m6
+    movapd         [r0 + 8 * r2 + mmsize * 3], m1
+    add            r2, mmsize / 2
+    jle .loop ; r2 <= 0 means at least one more full vector of values remains
+.skip_loop:
+    sub            r2, mmsize / 2 ; r2 = -(number of leftover values)
+    jz .end
+.scalar:
+    movzx          r3d, word [r1 + 2 * r2 + mmsize] ; r1 + mmsize == &src[count]
+    movsx          r3d, r3w ; reinterpret the word as signed
+    cvtsi2sd       xm0, r3d
+    mulsd          xm0, xm2
+    movsd          [r0 + 8 * r2 + 4 * mmsize], xm0 ; r0 + 4 * mmsize == &dst[count]
+    inc            r2
+    jl .scalar
+.end:
+    RET
+%endmacro
+
+INIT_XMM ssse3
+CUTREE_FIX8
+
+INIT_YMM avx2
+CUTREE_FIX8
diff -r 0af296185f7a -r 00860e5a58eb source/common/x86/mc.h
--- a/source/common/x86/mc.h	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/x86/mc.h	Thu Jun 09 15:48:42 2016 +0530
@@ -46,4 +46,20 @@
 
 #undef PROPAGATE_COST
 
+#define FIX8UNPACK(cpu) /* prototype of the assembly Q8.8 -> double unpack routine for one instruction set */ \
+    void PFX(cutree_fix8_unpack_ ## cpu)(double *dst, uint16_t *src, int count);
+
+FIX8UNPACK(ssse3)
+FIX8UNPACK(avx2)
+
+#undef FIX8UNPACK /* helper is only needed for the two declarations above */
+
+#define FIX8PACK(cpu) /* prototype of the assembly double -> Q8.8 pack routine for one instruction set */ \
+    void PFX(cutree_fix8_pack_## cpu)(uint16_t *dst, double *src, int count);
+
+FIX8PACK(ssse3)
+FIX8PACK(avx2)
+
+#undef FIX8PACK /* helper is only needed for the two declarations above */
+
 #endif // ifndef X265_MC_H
diff -r 0af296185f7a -r 00860e5a58eb source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/encoder/ratecontrol.cpp	Thu Jun 09 15:48:42 2016 +0530
@@ -1432,12 +1432,9 @@
             }
             while(type != sliceTypeActual);
         }
+        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu);
         for (int i = 0; i < m_ncu; i++)
-        {
-            int16_t qpFix8 = m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos][i];
-            frame->m_lowres.qpCuTreeOffset[i] = (double)(qpFix8) / 256.0;
             frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]);
-        }
         m_cuTreeStats.qpBufPos--;
     }
     return true;
@@ -2596,8 +2593,7 @@
     if (m_param->rc.cuTree && IS_REFERENCED(curFrame) && !m_param->rc.bStatRead)
     {
         uint8_t sliceType = (uint8_t)rce->sliceType;
-        for (int i = 0; i < m_ncu; i++)
-                m_cuTreeStats.qpBuffer[0][i] = (uint16_t)(curFrame->m_lowres.qpCuTreeOffset[i] * 256.0);
+        primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, m_ncu);
         if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
             goto writeFailure;
         if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu, m_cutreeStatFileOut) < (size_t)m_ncu)
diff -r 0af296185f7a -r 00860e5a58eb source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/test/pixelharness.cpp	Thu Jun 09 15:48:42 2016 +0530
@@ -43,6 +43,7 @@
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
         residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
+        double_test_buff[0][i]  = (double)(short_test_buff[0][i]) / 256.0;
 
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
@@ -52,6 +53,7 @@
         ushort_test_buff[1][i]  = PIXEL_MIN;
         uchar_test_buff[1][i]   = PIXEL_MIN;
         residual_test_buff[1][i] = RMIN;
+        double_test_buff[1][i]  = (double)(short_test_buff[1][i]) / 256.0;
 
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
@@ -61,6 +63,7 @@
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
         uchar_test_buff[2][i]   = 255;
         residual_test_buff[2][i] = RMAX;
+        double_test_buff[2][i] = (double)(short_test_buff[2][i]) / 256.0;
 
         pbuf1[i] = rand() & PIXEL_MAX;
         pbuf2[i] = rand() & PIXEL_MAX;
@@ -1397,6 +1400,60 @@
     return true;
 }
 
+bool PixelHarness::check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt)
+{
+    // Run both implementations on the same doubles; the whole output buffers must match.
+    ALIGN_VAR_32(uint16_t, ref_dest[64 * 64]);
+    ALIGN_VAR_32(uint16_t, opt_dest[64 * 64]);
+    // Identical fill pattern, so bytes neither routine writes still compare equal.
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        // count grows by one per iteration, starting at 256
+        const int count = 256 + iter;
+        double *input = double_test_buff[iter % TEST_CASES] + offset;
+        checked(opt, opt_dest, input, count);
+        ref(ref_dest, input, count);
+
+        const bool same = !memcmp(ref_dest, opt_dest, sizeof(ref_dest));
+        if (!same)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt)
+{
+    // Run both implementations on the same words; the whole output buffers must match bit for bit.
+    ALIGN_VAR_32(double, ref_dest[64 * 64]);
+    ALIGN_VAR_32(double, opt_dest[64 * 64]);
+    // Identical fill pattern, so bytes neither routine writes still compare equal.
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        // count grows by one per iteration, starting at 256
+        const int count = 256 + iter;
+        uint16_t *input = ushort_test_buff[iter % TEST_CASES] + offset;
+        checked(opt, opt_dest, input, count);
+        ref(ref_dest, input, count);
+
+        const bool same = !memcmp(ref_dest, opt_dest, sizeof(ref_dest));
+        if (!same)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
 {
     int j = 0, index1, index2, optres, refres;
@@ -2531,6 +2588,24 @@
         }
     }
 
+    if (opt.fix8Pack)
+    {
+        if (!check_cutree_fix8_pack(ref.fix8Pack, opt.fix8Pack))
+        {
+            printf("cuTreeFix8Pack failed\n");
+            return false;
+        }
+    }
+
+    if (opt.fix8Unpack)
+    {
+        if (!check_cutree_fix8_unpack(ref.fix8Unpack, opt.fix8Unpack))
+        {
+            printf("cuTreeFix8Unpack failed\n");
+            return false;
+        }
+    }
+
     if (opt.scanPosLast)
     {
         if (!check_scanPosLast(ref.scanPosLast, opt.scanPosLast))
@@ -3030,6 +3105,18 @@
         REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
     }
 
+    if (opt.fix8Pack)
+    {
+        HEADER0("cuTreeFix8Pack");
+        REPORT_SPEEDUP(opt.fix8Pack, ref.fix8Pack, ushort_test_buff[0], double_test_buff[0], 390);
+    }
+
+    if (opt.fix8Unpack)
+    {
+        HEADER0("cuTreeFix8Unpack");
+        REPORT_SPEEDUP(opt.fix8Unpack, ref.fix8Unpack, double_test_buff[0], ushort_test_buff[0], 390);
+    }
+
     if (opt.scanPosLast)
     {
         HEADER0("scanPosLast");
diff -r 0af296185f7a -r 00860e5a58eb source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Jun 07 09:20:11 2016 +0530
+++ b/source/test/pixelharness.h	Thu Jun 09 15:48:42 2016 +0530
@@ -113,6 +113,8 @@
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
+    bool check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt);
+    bool check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt);
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_calSign(sign_t ref, sign_t opt);
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);


More information about the x265-devel mailing list