[x265] [PATCH] replace sse_sp(residual, ZERO) by ssd_s(residual)
Min Chen
chenm003 at 163.com
Wed Jul 16 02:51:38 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1405471890 25200
# Node ID 78f7b217e5d53ab981bb0b5ac0f43e8c46260c9f
# Parent c923f4a9494619665bf49db7ae0e250e2f8c4ec7
replace sse_sp(residual, ZERO) by ssd_s(residual)
diff -r c923f4a94946 -r 78f7b217e5d5 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Jul 14 17:27:04 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Jul 15 17:51:30 2014 -0700
@@ -2374,9 +2374,8 @@
if ((cu->getSlice()->getPPS()->getTransquantBypassEnableFlag()))
{
bIsTQBypassEnable = true; // mark that the first iteration is to cost TQB mode.
- tqBypassMode = 2;
- if (m_param->bLossless)
- tqBypassMode = 1;
+ if (!m_param->bLossless)
+ tqBypassMode = 2;
}
uint64_t bestCost = MAX_INT64;
@@ -2814,7 +2813,8 @@
}
int partSize = partitionFromLog2Size(log2TrSize);
- uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, 0);
+ assert(log2TrSize <= 5);
+ uint32_t distY = primitives.ssd_s[log2TrSize - 2](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width);
uint32_t psyEnergyY = 0;
if (m_rdCost.psyRdEnabled())
{
@@ -2923,7 +2923,7 @@
int16_t *curResiU = m_qtTempShortYuv[qtLayer].getCbAddr(absPartIdxC);
int16_t *curResiV = m_qtTempShortYuv[qtLayer].getCrAddr(absPartIdxC);
- distU = m_rdCost.scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, 0));
+ distU = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth));
if (outZeroDist)
*outZeroDist += distU;
@@ -3008,7 +3008,7 @@
if (!numSigU[tuIterator.section])
primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
- distV = m_rdCost.scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, 0));
+ distV = m_rdCost.scaleChromaDistCr(primitives.ssd_s[log2TrSizeC - 2](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth));
if (outZeroDist)
*outZeroDist += distV;
diff -r c923f4a94946 -r 78f7b217e5d5 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Jul 14 17:27:04 2014 +0530
+++ b/source/common/pixel.cpp Tue Jul 15 17:51:30 2014 -0700
@@ -375,6 +375,21 @@
return cost;
}
+template<int size> // size: edge length of the square block, in samples
+int pixel_ssd_s_c(short *a, intptr_t dstride) // returns the sum of squares of a size x size block; dstride = row pitch in elements
+{
+ int sum = 0; // plain int accumulator; nothing here guards against overflow
+ for (int y = 0; y < size; y++)
+ {
+ for (int x = 0; x < size; x++)
+ {
+ sum += a[x] * a[x]; // square of one sample (operands promote to int)
+ }
+ a += dstride; // step to the next row
+ }
+ return sum;
+}
+
void blockcopy_p_p(int bx, int by, pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
@@ -1200,6 +1215,11 @@
p.transpose[BLOCK_32x32] = transpose<32>;
p.transpose[BLOCK_64x64] = transpose<64>;
+ p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
+ p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
+ p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
+ p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
+
p.weight_pp = weight_pp_c;
p.weight_sp = weight_sp_c;
diff -r c923f4a94946 -r 78f7b217e5d5 source/common/primitives.h
--- a/source/common/primitives.h Mon Jul 14 17:27:04 2014 +0530
+++ b/source/common/primitives.h Tue Jul 15 17:51:30 2014 -0700
@@ -130,6 +130,7 @@
typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
+typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride);
typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
@@ -204,6 +205,7 @@
pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed
pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed
+ pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed
pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD)
pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks
diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 14 17:27:04 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 15 17:51:30 2014 -0700
@@ -1024,6 +1024,12 @@
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
+ // TODO: these primitives can overflow in 12-bit mode!
+ p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
+ p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
+ p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
+ p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
+
p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
@@ -1156,6 +1162,11 @@
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
+ p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
+ p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
+ p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
+ p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
+
p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
SA8D_INTER_FROM_BLOCK(sse2);
@@ -1315,6 +1326,7 @@
INIT2_NAME(sse_pp, ssd, _avx2);
p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
+ p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Jul 14 17:27:04 2014 +0530
+++ b/source/common/x86/pixel.h Tue Jul 15 17:51:30 2014 -0700
@@ -166,6 +166,12 @@
int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t);
+int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t);
+
#define ADDAVG(func) \
void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
ADDAVG(addAvg_2x4)
diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Mon Jul 14 17:27:04 2014 +0530
+++ b/source/common/x86/ssd-a.asm Tue Jul 15 17:51:30 2014 -0700
@@ -2395,3 +2395,224 @@
HADDD m7, m1
movd eax, m7
RET
+
+
+;-----------------------------------------------------------------------------
+; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixel_ssd_s_4, 2,2,2 ; sum of squares of a 4x4 int16 block: r0 = block pointer, r1 = stride in elements
+ add r1, r1 ; stride: int16 elements -> bytes
+ movh m0, [r0] ; row 0 (4 words) into the low half
+ movhps m0, [r0 + r1] ; row 1 into the high half
+
+ lea r0, [r0 + r1 * 2] ; advance two rows
+ movh m1, [r0] ; row 2
+ movhps m1, [r0 + r1] ; row 3
+
+ pmaddwd m0, m0 ; square each word and add adjacent pairs into dwords
+ pmaddwd m1, m1
+
+ ; calculate sum
+ paddd m0, m1 ; combine rows 0-1 with rows 2-3
+ movhlps m1, m0 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, 1 ; bring dword 1 down to lane 0
+ paddd m0, m1 ; lane 0 now holds the total
+
+ movd eax, m0 ; return the low dword
+ RET
+
+
+INIT_XMM sse2
+cglobal pixel_ssd_s_8, 2,3,5 ; sum of squares of an 8x8 int16 block: r0 = block pointer, r1 = stride in elements
+ add r1, r1 ; stride: int16 elements -> bytes
+ lea r2, [r1 * 3] ; r2 = byte offset of the third row below r0
+ movu m0, [r0] ; rows 0-3, one 8-word row per register
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r2]
+
+ pmaddwd m0, m0 ; square each word and add adjacent pairs into dwords
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, m2 ; m0 = partial sums for rows 0-3
+
+ lea r0, [r0 + r1 * 4] ; advance four rows
+ movu m4, [r0] ; rows 4-7
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r2]
+
+ pmaddwd m4, m4
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m4, m1
+ paddd m2, m3
+ paddd m4, m2 ; m4 = partial sums for rows 4-7
+
+ ; calculate sum
+ paddd m0, m4 ; combine both halves of the block
+ movhlps m1, m0 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, 1 ; bring dword 1 down to lane 0
+ paddd m0, m1 ; lane 0 now holds the total
+
+ movd eax, m0 ; return the low dword
+ RET
+
+
+INIT_XMM sse2
+cglobal pixel_ssd_s_16, 2,3,5 ; sum of squares of a 16x16 int16 block: r0 = block pointer, r1 = stride in elements
+ add r1, r1 ; stride: int16 elements -> bytes
+
+ mov r2d, 4 ; 4 iterations x 4 rows = 16 rows
+ pxor m0, m0 ; m0 = running dword accumulators
+.loop:
+ movu m1, [r0] ; first row of the pair, low 8 words
+ movu m2, [r0 + mmsize] ; first row, high 8 words
+ movu m3, [r0 + r1] ; second row, low 8 words
+ movu m4, [r0 + r1 + mmsize] ; second row, high 8 words
+ lea r0, [r0 + r1 * 2] ; advance two rows
+
+ pmaddwd m1, m1 ; square each word and add adjacent pairs into dwords
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1 ; accumulate these two rows
+
+ movu m1, [r0] ; next pair of rows, same pattern as above
+ movu m2, [r0 + mmsize]
+ movu m3, [r0 + r1]
+ movu m4, [r0 + r1 + mmsize]
+ lea r0, [r0 + r1 * 2] ; advance two rows
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1 ; accumulate these two rows
+
+ dec r2d
+ jnz .loop
+
+ ; calculate sum
+ movhlps m1, m0 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, 1 ; bring dword 1 down to lane 0
+ paddd m0, m1 ; lane 0 now holds the total
+
+ movd eax, m0 ; return the low dword
+ RET
+
+
+INIT_XMM sse2
+cglobal pixel_ssd_s_32, 2,3,5 ; sum of squares of a 32x32 int16 block: r0 = block pointer, r1 = stride in elements
+ add r1, r1 ; stride: int16 elements -> bytes
+
+ mov r2d, 16 ; 16 iterations x 2 rows = 32 rows
+ pxor m0, m0 ; m0 = running dword accumulators
+.loop:
+ movu m1, [r0 + 0 * mmsize] ; one 32-word row, loaded as four registers
+ movu m2, [r0 + 1 * mmsize]
+ movu m3, [r0 + 2 * mmsize]
+ movu m4, [r0 + 3 * mmsize]
+ add r0, r1 ; advance one row
+
+ pmaddwd m1, m1 ; square each word and add adjacent pairs into dwords
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1 ; accumulate this row
+
+ movu m1, [r0 + 0 * mmsize] ; next row, same pattern as above
+ movu m2, [r0 + 1 * mmsize]
+ movu m3, [r0 + 2 * mmsize]
+ movu m4, [r0 + 3 * mmsize]
+ add r0, r1 ; advance one row
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1 ; accumulate this row
+
+ dec r2d
+ jnz .loop
+
+ ; calculate sum
+ movhlps m1, m0 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, 1 ; bring dword 1 down to lane 0
+ paddd m0, m1 ; lane 0 now holds the total
+
+ movd eax, m0 ; return the low dword
+ RET
+
+
+INIT_YMM avx2
+cglobal pixel_ssd_s_32, 2,4,5 ; sum of squares of a 32x32 int16 block: r0 = block pointer, r1 = stride in elements
+ add r1, r1 ; stride: int16 elements -> bytes
+ lea r3, [r1 * 3] ; r3 = byte offset of the third row below r0
+
+ mov r2d, 8 ; 8 iterations x 4 rows = 32 rows
+ pxor m0, m0 ; m0 = running dword accumulators
+.loop:
+ movu m1, [r0 + 0 * mmsize] ; first row, low 16 words
+ movu m2, [r0 + 1 * mmsize] ; first row, high 16 words
+ movu m3, [r0 + r1 + 0 * mmsize] ; second row, low 16 words
+ movu m4, [r0 + r1 + 1 * mmsize] ; second row, high 16 words
+
+ pmaddwd m1, m1 ; square each word and add adjacent pairs into dwords
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1 ; accumulate the first two rows
+
+ movu m1, [r0 + r1 * 2 + 0 * mmsize] ; third row
+ movu m2, [r0 + r1 * 2 + 1 * mmsize]
+ movu m3, [r0 + r3 + 0 * mmsize] ; fourth row
+ movu m4, [r0 + r3 + 1 * mmsize]
+ lea r0, [r0 + 4 * r1] ; advance four rows
+
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m1, m3
+ paddd m0, m1 ; accumulate the last two rows
+
+ dec r2d
+ jnz .loop
+
+ ; calculate sum
+ vextracti128 xm1, m0, 1 ; pull out the upper 128-bit lane
+ paddd xm0, xm1 ; fold it onto the lower lane
+ movhlps xm1, xm0 ; fold the upper two dwords onto the lower two
+ paddd xm0, xm1
+ pshufd xm1, xm0, 1 ; bring dword 1 down to lane 0
+ paddd xm0, xm1 ; lane 0 now holds the total
+
+ movd eax, xm0 ; return the low dword
+ RET
diff -r c923f4a94946 -r 78f7b217e5d5 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jul 14 17:27:04 2014 +0530
+++ b/source/test/pixelharness.cpp Tue Jul 15 17:51:30 2014 -0700
@@ -394,6 +394,28 @@
return true;
}
+bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt) // runs ref and opt on the same input ITERS times; false on the first mismatch
+{
+ int j = 0; // element offset into sbuf1, advanced by INCR after each iteration
+ for (int i = 0; i < ITERS; i++)
+ {
+ // NOTE: stride must be a multiple of 16, because the minimum block is 4x4
+ int stride = (STRIDE + (rand() % STRIDE)) & ~15; // random value below 2 * STRIDE, rounded down to a multiple of 16
+ int cres = ref(sbuf1 + j, stride); // reference result
+ int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride); // optimized result; checked() is defined elsewhere - presumably a validating call wrapper, TODO confirm
+
+ if (cres != vres)
+ {
+ return false;
+ }
+
+ reportfail(); // defined elsewhere - presumably bails out if checked() flagged a problem, TODO confirm
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1312,6 +1334,15 @@
}
}
+ if ((i <= BLOCK_32x32) && opt.ssd_s[i])
+ {
+ if (!check_ssd_s(ref.ssd_s[i], opt.ssd_s[i]))
+ {
+ printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
+
if (opt.blockfill_s[i])
{
if (!check_blockfill_s(ref.blockfill_s[i], opt.blockfill_s[i]))
@@ -1656,6 +1687,11 @@
for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
{
+ if ((i <= BLOCK_32x32) && opt.ssd_s[i])
+ {
+ HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.ssd_s[i], ref.ssd_s[i], sbuf1, STRIDE);
+ }
if (opt.sa8d[i])
{
HEADER("sa8d[%dx%d]", 4 << i, 4 << i);
diff -r c923f4a94946 -r 78f7b217e5d5 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Jul 14 17:27:04 2014 +0530
+++ b/source/test/pixelharness.h Tue Jul 15 17:51:30 2014 -0700
@@ -53,6 +53,7 @@
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
bool check_pixeladd_ss(pixeladd_ss_t ref, pixeladd_ss_t opt);
bool check_scale_pp(scale_t ref, scale_t opt);
+ bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
bool check_calcrecon(calcrecon_t ref, calcrecon_t opt);
More information about the x265-devel
mailing list