[x265] [PATCH] asm: ssse3 and avx2 for cutree fixed point conversion
Divya Manivannan
divya at multicorewareinc.com
Thu Jun 9 12:53:45 CEST 2016
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1465467522 -19800
# Thu Jun 09 15:48:42 2016 +0530
# Node ID 00860e5a58eb6cf495b9e2c1d9cf6f3c0cc042e8
# Parent 0af296185f7ae3e05493ecf164046ddfec085bb3
asm: ssse3 and avx2 for cutree fixed point conversion
ssse3:
cuTreeFix8Pack 1.99x 3438.00 6841.12
cuTreeFix8Unpack 1.77x 4268.45 7572.87
avx2:
cuTreeFix8Pack 3.44x 2005.33 6905.06
cuTreeFix8Unpack 2.61x 2926.97 7631.64
diff -r 0af296185f7a -r 00860e5a58eb source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/pixel.cpp Thu Jun 09 15:48:42 2016 +0530
@@ -872,6 +872,22 @@
}
}
+/* Conversion between double and Q8.8 fixed point for storage.
+ * Each src[i] is scaled by 256 and truncated toward zero; the low 16 bits of
+ * the signed result are stored in dst[i] (two's complement, no byte swapping
+ * is done here).  Writes exactly count elements; count <= 0 writes nothing. */
+static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
+{
+    for (int i = 0; i < count; i++)
+    {
+        /* Go through a signed int first: converting a negative double straight
+         * to uint16_t is undefined behaviour (some targets saturate to 0),
+         * while int -> uint16_t is a well-defined modular conversion. */
+        const int fix8 = (int)(src[i] * 256.0);
+        dst[i] = (uint16_t)fix8;
+    }
+}
+
+/* Expands count stored 16-bit words into doubles: each word is read as a
+ * signed 16-bit value and divided by 256.  count <= 0 writes nothing. */
+static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
+{
+    for (int n = 0; n < count; n++)
+    {
+        const int16_t signedFix8 = (int16_t)src[n]; /* reinterpret the stored word as signed */
+        dst[n] = signedFix8 / 256.0;
+    }
+}
+
#if HIGH_BIT_DEPTH
static void calcHDRStats_c(pixel *srcY, pixel* srcU, pixel* srcV, intptr_t stride, intptr_t strideC, int width, int height, double *outsum,
pixel *outMax, const pixel minPix, const pixel maxPix, const int hShift, const int vShift)
@@ -1225,5 +1241,7 @@
p.calcHDRStats = calcHDRStats_c;
#endif
p.propagateCost = estimateCUPropagateCost;
+ p.fix8Unpack = cuTreeFix8Unpack;
+ p.fix8Pack = cuTreeFix8Pack;
}
}
diff -r 0af296185f7a -r 00860e5a58eb source/common/primitives.h
--- a/source/common/primitives.h Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/primitives.h Thu Jun 09 15:48:42 2016 +0530
@@ -189,6 +189,9 @@
typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
+typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
+typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
+
typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
@@ -314,6 +317,8 @@
downscale_t frameInitLowres;
cutree_propagate_cost propagateCost;
+ cutree_fix8_unpack fix8Unpack;
+ cutree_fix8_pack fix8Pack;
extendCURowBorder_t extendRowBorder;
planecopy_cp_t planecopy_cp;
diff -r 0af296185f7a -r 00860e5a58eb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 09 15:48:42 2016 +0530
@@ -1098,6 +1098,8 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
+ p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
+ p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -2152,6 +2154,8 @@
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+ p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
+ p.fix8Pack = PFX(cutree_fix8_pack_avx2);
// TODO: depends on hps and vsp
ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes
@@ -2452,6 +2456,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
+ p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
+ p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -3683,6 +3689,8 @@
p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
+ p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
+ p.fix8Pack = PFX(cutree_fix8_pack_avx2);
}
#endif
diff -r 0af296185f7a -r 00860e5a58eb source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/x86/mc-a2.asm Thu Jun 09 15:48:42 2016 +0530
@@ -44,6 +44,10 @@
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif
+cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7
+ db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15
+
+const pq_256, times 4 dq 256.0
const pd_inv256, times 4 dq 0.00390625
const pd_0_5, times 4 dq 0.5
@@ -1213,3 +1217,121 @@
INIT_YMM avx2
MBTREE_AVX
+
+
+%macro CUTREE_FIX8 0
+;-----------------------------------------------------------------------------
+; void cutree_fix8_pack( uint16_t *dst, double *src, int count )
+;-----------------------------------------------------------------------------
+cglobal cutree_fix8_pack, 3, 4, 5 ; dst[i] = 16-bit word of trunc(src[i] * 256.0); body uses r0-r3 and m0-m4
+ movapd m2, [pq_256] ; m2 = 256.0 in every lane; NOTE(review): aligned moves and memory-operand mulpd suggest src/dst must be mmsize-aligned - confirm at call sites
+ sub r2d, mmsize / 2 ; one loop iteration handles mmsize/2 elements; bias count by one iteration
+ movsxdifnidn r2, r2d ; x86inc helper; presumably widens the 32-bit count to pointer width where needed - confirm
+ lea r1, [r1 + 8 * r2] ; src += (count - mmsize/2) doubles
+ lea r0, [r0 + 2 * r2] ; dst += (count - mmsize/2) words
+ neg r2 ; r2 = mmsize/2 - count, an index that counts upward
+ jg .skip_loop ; count < mmsize/2: not even one full vector group
+.loop:
+ mulpd m0, m2, [r1 + 8 * r2] ; scale four vectors of doubles by 256.0
+ mulpd m1, m2, [r1 + 8 * r2 + mmsize]
+ mulpd m3, m2, [r1 + 8 * r2 + 2 * mmsize]
+ mulpd m4, m2, [r1 + 8 * r2 + 3 * mmsize]
+ cvttpd2dq xm0, m0 ; truncating double -> int32, results land in the xmm part
+ cvttpd2dq xm1, m1
+ cvttpd2dq xm3, m3
+ cvttpd2dq xm4, m4
+%if mmsize == 32
+ vinserti128 m0, m0, xm3, 1 ; m0 = [group 0 | group 2]
+ vinserti128 m1, m1, xm4, 1 ; m1 = [group 1 | group 3]
+ packssdw m0, m1 ; per-lane pack gives groups 0,1 | 2,3 in element order; int32 -> int16 with signed saturation
+%else
+ punpcklqdq m0, m1 ; each conversion filled only the low qword; join pairs
+ punpcklqdq m3, m4
+ packssdw m0, m3 ; int32 -> int16 with signed saturation
+%endif
+ mova [r0 + 2 * r2], m0 ; store mmsize/2 packed words
+ add r2, mmsize / 2
+ jle .loop ; continue while another full group of mmsize/2 elements remains
+.skip_loop:
+ sub r2, mmsize / 2 ; r2 = -(number of leftover elements)
+ jz .end ; no leftovers
+ ; Do the remaining values in scalar in order to avoid overreading src.
+.scalar:
+ movq xm0, [r1 + 8 * r2 + 4 * mmsize] ; r1 + 4*mmsize is the end of src, r2 is a negative element index
+ mulsd xm0, xm2
+ cvttsd2si r3d, xm0 ; truncating double -> int32
+ mov [r0 + 2 * r2 + mmsize], r3w ; r0 + mmsize is the end of dst; keeps the low 16 bits (no saturation, unlike packssdw above)
+ inc r2
+ jl .scalar
+.end:
+ RET
+
+;-----------------------------------------------------------------------------
+; void cutree_fix8_unpack( double *dst, uint16_t *src, int count )
+;-----------------------------------------------------------------------------
+cglobal cutree_fix8_unpack, 3, 4, 7 ; dst[i] = (signed 16-bit src[i]) * (1/256); body uses r0-r3 and m0-m6
+%if mmsize != 32
+ mova m4, [cutree_fix8_unpack_shuf+16] ; SSE path: second 16 bytes of the shuffle table (source words 4-7)
+%endif
+ movapd m2, [pd_inv256] ; m2 = 0.00390625 (1/256) in every lane
+ mova m3, [cutree_fix8_unpack_shuf] ; shuffle puts each word in the upper half of a dword, lower half zeroed
+ sub r2d, mmsize / 2 ; same biased-index scheme as cutree_fix8_pack
+ movsxdifnidn r2, r2d
+ lea r1, [r1 + 2 * r2] ; src += (count - mmsize/2) words
+ lea r0, [r0 + 8 * r2] ; dst += (count - mmsize/2) doubles
+ neg r2
+ jg .skip_loop ; count < mmsize/2: scalar only
+.loop:
+%if mmsize == 32
+ vbroadcasti128 m0, [r1 + 2 * r2] ; 8 source words copied into both 128-bit lanes
+ vbroadcasti128 m1, [r1 + 2 * r2 + 16] ; next 8 source words
+ pshufb m0, m3 ; low lane takes words 0-3, high lane words 4-7 of its copy
+ pshufb m1, m3
+%else
+ mova m1, [r1 + 2 * r2] ; 8 source words
+ pshufb m0, m1, m3 ; words 0-3 -> dwords
+ pshufb m1, m4 ; words 4-7 -> dwords
+%endif
+ psrad m0, 16 ; sign-extend
+ psrad m1, 16
+ cvtdq2pd m5, xm0 ; convert the lower half of m0's dwords to doubles
+ cvtdq2pd m6, xm1
+%if mmsize == 32
+ vpermq m0, m0, q1032 ; swap 128-bit halves so the upper dwords sit in the xmm part
+ vpermq m1, m1, q1032
+%else
+ psrldq m0, 8 ; shift the upper two dwords down
+ psrldq m1, 8
+%endif
+ cvtdq2pd m0, xm0 ; convert the remaining half
+ cvtdq2pd m1, xm1
+ mulpd m0, m2 ; scale all four result vectors by 1/256
+ mulpd m1, m2
+ mulpd m5, m2
+ mulpd m6, m2
+ movapd [r0 + 8 * r2], m5 ; stores go out in source element order: m5, m0, m6, m1
+ movapd [r0 + 8 * r2 + mmsize], m0
+ movapd [r0 + 8 * r2 + mmsize * 2], m6
+ movapd [r0 + 8 * r2 + mmsize * 3], m1
+ add r2, mmsize / 2
+ jle .loop
+.skip_loop:
+ sub r2, mmsize / 2 ; r2 = -(number of leftover elements)
+ jz .end
+.scalar:
+ movzx r3d, word [r1 + 2 * r2 + mmsize] ; r1 + mmsize is the end of src
+ movsx r3d, r3w ; sign-extend the word; NOTE(review): the movzx + movsx pair looks equivalent to one sign-extending load
+ cvtsi2sd xm0, r3d
+ mulsd xm0, xm2
+ movsd [r0 + 8 * r2 + 4 * mmsize], xm0 ; r0 + 4*mmsize is the end of dst
+ inc r2
+ jl .scalar
+.end:
+ RET
+%endmacro
+
+INIT_XMM ssse3
+CUTREE_FIX8
+
+INIT_YMM avx2
+CUTREE_FIX8
diff -r 0af296185f7a -r 00860e5a58eb source/common/x86/mc.h
--- a/source/common/x86/mc.h Tue Jun 07 09:20:11 2016 +0530
+++ b/source/common/x86/mc.h Thu Jun 09 15:48:42 2016 +0530
@@ -46,4 +46,20 @@
#undef PROPAGATE_COST
+/* Prototypes for the cuTree Q8.8 <-> double conversion kernels (one unpack and
+ * one pack entry point per instruction set); the helper macro is temporary. */
+#define CUTREE_FIX8_DECL(cpu) \
+    void PFX(cutree_fix8_unpack_ ## cpu)(double *dst, uint16_t *src, int count); \
+    void PFX(cutree_fix8_pack_ ## cpu)(uint16_t *dst, double *src, int count);
+
+CUTREE_FIX8_DECL(ssse3)
+CUTREE_FIX8_DECL(avx2)
+
+#undef CUTREE_FIX8_DECL
+
+
#endif // ifndef X265_MC_H
diff -r 0af296185f7a -r 00860e5a58eb source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp Tue Jun 07 09:20:11 2016 +0530
+++ b/source/encoder/ratecontrol.cpp Thu Jun 09 15:48:42 2016 +0530
@@ -1432,12 +1432,9 @@
}
while(type != sliceTypeActual);
}
+ primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu);
for (int i = 0; i < m_ncu; i++)
- {
- int16_t qpFix8 = m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos][i];
- frame->m_lowres.qpCuTreeOffset[i] = (double)(qpFix8) / 256.0;
frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]);
- }
m_cuTreeStats.qpBufPos--;
}
return true;
@@ -2596,8 +2593,7 @@
if (m_param->rc.cuTree && IS_REFERENCED(curFrame) && !m_param->rc.bStatRead)
{
uint8_t sliceType = (uint8_t)rce->sliceType;
- for (int i = 0; i < m_ncu; i++)
- m_cuTreeStats.qpBuffer[0][i] = (uint16_t)(curFrame->m_lowres.qpCuTreeOffset[i] * 256.0);
+ primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, m_ncu);
if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
goto writeFailure;
if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu, m_cutreeStatFileOut) < (size_t)m_ncu)
diff -r 0af296185f7a -r 00860e5a58eb source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Jun 07 09:20:11 2016 +0530
+++ b/source/test/pixelharness.cpp Thu Jun 09 15:48:42 2016 +0530
@@ -43,6 +43,7 @@
ushort_test_buff[0][i] = rand() % ((1 << 16) - 1);
uchar_test_buff[0][i] = rand() % ((1 << 8) - 1);
residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
+ double_test_buff[0][i] = (double)(short_test_buff[0][i]) / 256.0;
pixel_test_buff[1][i] = PIXEL_MIN;
short_test_buff[1][i] = SMIN;
@@ -52,6 +53,7 @@
ushort_test_buff[1][i] = PIXEL_MIN;
uchar_test_buff[1][i] = PIXEL_MIN;
residual_test_buff[1][i] = RMIN;
+ double_test_buff[1][i] = (double)(short_test_buff[1][i]) / 256.0;
pixel_test_buff[2][i] = PIXEL_MAX;
short_test_buff[2][i] = SMAX;
@@ -61,6 +63,7 @@
ushort_test_buff[2][i] = ((1 << 16) - 1);
uchar_test_buff[2][i] = 255;
residual_test_buff[2][i] = RMAX;
+ double_test_buff[2][i] = (double)(short_test_buff[2][i]) / 256.0;
pbuf1[i] = rand() & PIXEL_MAX;
pbuf2[i] = rand() & PIXEL_MAX;
@@ -1397,6 +1400,60 @@
return true;
}
+/* Runs the optimized pack primitive against the reference over ITERS rounds,
+ * each with a different element count and input offset, and fails on the
+ * first round where the two output buffers differ. */
+bool PixelHarness::check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt)
+{
+    ALIGN_VAR_32(uint16_t, refOut[64 * 64]);
+    ALIGN_VAR_32(uint16_t, optOut[64 * 64]);
+
+    // Same fill pattern in both buffers so elements neither routine writes compare equal.
+    memset(refOut, 0xCD, sizeof(refOut));
+    memset(optOut, 0xCD, sizeof(optOut));
+
+    for (int round = 0, offset = 0; round < ITERS; round++, offset += INCR)
+    {
+        int caseIdx = round % TEST_CASES;
+        int count = 256 + round;
+
+        checked(opt, optOut, double_test_buff[caseIdx] + offset, count);
+        ref(refOut, double_test_buff[caseIdx] + offset, count);
+
+        // Whole-buffer comparison: also catches writes past count.
+        if (memcmp(refOut, optOut, sizeof(refOut)) != 0)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
+/* Runs the optimized unpack primitive against the reference over ITERS rounds,
+ * each with a different element count and input offset, and fails on the
+ * first round where the two output buffers differ bytewise. */
+bool PixelHarness::check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt)
+{
+    ALIGN_VAR_32(double, refOut[64 * 64]);
+    ALIGN_VAR_32(double, optOut[64 * 64]);
+
+    // Same fill pattern in both buffers so elements neither routine writes compare equal.
+    memset(refOut, 0xCD, sizeof(refOut));
+    memset(optOut, 0xCD, sizeof(optOut));
+
+    for (int round = 0, offset = 0; round < ITERS; round++, offset += INCR)
+    {
+        int caseIdx = round % TEST_CASES;
+        int count = 256 + round;
+
+        checked(opt, optOut, ushort_test_buff[caseIdx] + offset, count);
+        ref(refOut, ushort_test_buff[caseIdx] + offset, count);
+
+        // Whole-buffer comparison: also catches writes past count.
+        if (memcmp(refOut, optOut, sizeof(refOut)) != 0)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
{
int j = 0, index1, index2, optres, refres;
@@ -2531,6 +2588,24 @@
}
}
+ if (opt.fix8Pack)
+ {
+ if (!check_cutree_fix8_pack(ref.fix8Pack, opt.fix8Pack))
+ {
+ printf("cuTreeFix8Pack failed\n");
+ return false;
+ }
+ }
+
+ if (opt.fix8Unpack)
+ {
+ if (!check_cutree_fix8_unpack(ref.fix8Unpack, opt.fix8Unpack))
+ {
+ printf("cuTreeFix8Unpack failed\n");
+ return false;
+ }
+ }
+
if (opt.scanPosLast)
{
if (!check_scanPosLast(ref.scanPosLast, opt.scanPosLast))
@@ -3030,6 +3105,18 @@
REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
}
+ if (opt.fix8Pack)
+ {
+ HEADER0("cuTreeFix8Pack");
+ REPORT_SPEEDUP(opt.fix8Pack, ref.fix8Pack, ushort_test_buff[0], double_test_buff[0], 390);
+ }
+
+ if (opt.fix8Unpack)
+ {
+ HEADER0("cuTreeFix8Unpack");
+ REPORT_SPEEDUP(opt.fix8Unpack, ref.fix8Unpack, double_test_buff[0], ushort_test_buff[0], 390);
+ }
+
if (opt.scanPosLast)
{
HEADER0("scanPosLast");
diff -r 0af296185f7a -r 00860e5a58eb source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Jun 07 09:20:11 2016 +0530
+++ b/source/test/pixelharness.h Thu Jun 09 15:48:42 2016 +0530
@@ -113,6 +113,8 @@
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
+ bool check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt);
+ bool check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt);
bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
bool check_calSign(sign_t ref, sign_t opt);
bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
More information about the x265-devel
mailing list