[x265] [PATCH Review only] asm: added psy_acEnergy_pp_4x4 in sse4 for psyCost_pp
Divya Manivannan
divya at multicorewareinc.com
Wed Dec 17 06:58:33 CET 2014
I confirmed that the encoder uses the 4x4 block size by testing with these two videos:
NebutaFestival_2560x1600_60_10bit_crop.yuv and 720p50_parkrun_ter.y4m.
Since there is no existing function with a similar definition, I have created a new
function definition for 4x4.
On Tue, Dec 16, 2014 at 10:55 PM, chen <chenm003 at 163.com> wrote:
>
>
>
>
> At 2014-12-16 18:35:49,"Divya Manivannan" <divya at multicorewareinc.com> wrote:
> ># HG changeset patch
> ># User Divya Manivannan <divya at multicorewareinc.com>
> ># Date 1418726099 -19800
> ># Tue Dec 16 16:04:59 2014 +0530
> ># Node ID de6f39b44c144aa56c68d27d6ee201e7dd493755
> ># Parent 775ebb4694ad7931a98b796640bf646085659ea2
> >asm: added psy_acEnergy_pp_4x4 in sse4 for psyCost_pp
> >
> >diff -r 775ebb4694ad -r de6f39b44c14 source/common/pixel.cpp
> >--- a/source/common/pixel.cpp Tue Dec 16 09:40:00 2014 +0530
> >+++ b/source/common/pixel.cpp Tue Dec 16 16:04:59 2014 +0530
> >@@ -795,8 +795,18 @@
> > else
> > {
> > /* 4x4 is too small for sa8d */
> >- int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
> >- int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
> >+ int sourceEnergy, reconEnergy;
> >+ if (!HIGH_BIT_DEPTH) // once HBD asm code is developed, if condition will go away
>
> Use #if here instead of a runtime if.
>
>
>
> >+ {
> >+ sourceEnergy = primitives.psy_acEnergy_pp(source, sstride);
> >+ reconEnergy = primitives.psy_acEnergy_pp(recon, rstride);
> >+ }
> >+ else
> >+ {
> >+ //original code;
> >+ sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
> >+ reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
>
> The reference code could be put into the primitives C model.
>
> >+ }
> > return abs(sourceEnergy - reconEnergy);
> > }
> > }
> >diff -r 775ebb4694ad -r de6f39b44c14 source/common/primitives.h
> >--- a/source/common/primitives.h Tue Dec 16 09:40:00 2014 +0530
> >+++ b/source/common/primitives.h Tue Dec 16 16:04:59 2014 +0530
> >@@ -195,6 +195,7 @@
> > typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
> >
> > typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
> >+typedef int(*psy_acEnergy_pp_t)(const pixel* pix, intptr_t stride);
> >
> > /* Define a structure containing function pointers to optimized encoder
> > * primitives. Each pointer can reference either an assembly routine,
> >@@ -213,6 +214,7 @@
> > pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks
> > pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in AC energy between two blocks
> > pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
> >+ psy_acEnergy_pp_t psy_acEnergy_pp;
> >
> > dct_t dct[NUM_DCTS];
> > idct_t idct[NUM_IDCTS];
> >diff -r 775ebb4694ad -r de6f39b44c14 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp Tue Dec 16 09:40:00 2014 +0530
> >+++ b/source/common/x86/asm-primitives.cpp Tue Dec 16 16:04:59 2014 +0530
> >@@ -1898,6 +1898,9 @@
> > p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;
> > p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;
> > #endif
> >+
> >+ p.psy_acEnergy_pp = x265_psy_acEnergy_pp_4x4_sse4;
> >+
> > }
> > #endif // if HIGH_BIT_DEPTH
> > }
> >diff -r 775ebb4694ad -r de6f39b44c14 source/common/x86/pixel-a.asm
> >--- a/source/common/x86/pixel-a.asm Tue Dec 16 09:40:00 2014 +0530
> >+++ b/source/common/x86/pixel-a.asm Tue Dec 16 16:04:59 2014 +0530
> >@@ -6579,3 +6579,35 @@
> > mov [r2], r3w
> > .end:
> > RET
> >+
> >+;---------------------------------------------------------------------------------------------------------------------
> >+;int psy_acEnergy_pp(const pixel* source, intptr_t sstride)
> >+;---------------------------------------------------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal psy_acEnergy_pp_4x4, 2, 3, 6
> >+
> >+ lea r2, [3 * r1]
> >+ movd m0, [r0]
> >+ movd m1, [r0 + r1]
> >+ movd m2, [r0 + r1 * 2]
> >+ movd m3, [r0 + r2]
> >+ shufps m0, m1, 0
>
> Does this overwrite m0 with the lowest 32 bits of m1?
>
> The computation logic below is wrong.
>
>
>
> >+ shufps m2, m3, 0
> >+ mova m4, [hmul_4p]
> >+ pmaddubsw m0, m4
> >+ pmaddubsw m2, m4
> >+
> >+ paddw m5, m0, m2
> >+ movhlps m4, m5
> >+ paddw m5, m4
> >+ phaddw m5, m5
>
> pmaddwd could replace this phaddw to avoid Port 5. This is just another approach; the generated object code needs to be analyzed.
>
>
>
> >+ pmovzxwd m5, m5
>
> This can be removed by using the pmaddwd suggested above.
>
>
>
> >+ psrld m5, 2
> >+
> >+ HADAMARD 0, sumsub, 0, 2, 1, 3
> >+ HADAMARD 4, sumsub, 0, 2, 1, 3
> >+ HADAMARD 1, amax, 0, 2, 1, 3
> >+ HADDW m0, m2
> >+ psubd m0, m5
> >+ movd eax, m0
> >+ RET
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141217/09c5cbb1/attachment.html>
More information about the x265-devel
mailing list