<div dir="ltr">I verified that the encoder uses the 4x4 block size by testing with these two videos: NebutaFestival_2560x1600_60_10bit_crop.yuv and 720p50_parkrun_ter.y4m.<div>Since there was no similar existing function definition, I have added a new function definition for 4x4.</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Dec 16, 2014 at 10:55 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div> </div><pre><br>At 2014-12-16 18:35:49,"Divya Manivannan" <<a href="mailto:divya@multicorewareinc.com" target="_blank">divya@multicorewareinc.com</a>> wrote:


># HG changeset patch


># User Divya Manivannan <<a href="mailto:divya@multicorewareinc.com" target="_blank">divya@multicorewareinc.com</a>>


># Date 1418726099 -19800


>#      Tue Dec 16 16:04:59 2014 +0530


># Node ID de6f39b44c144aa56c68d27d6ee201e7dd493755


># Parent  775ebb4694ad7931a98b796640bf646085659ea2


>asm: added psy_acEnergy_pp_4x4 in sse4 for psyCost_pp


>


>diff -r 775ebb4694ad -r de6f39b44c14 source/common/pixel.cpp


>--- a/source/common/pixel.cpp      Tue Dec 16 09:40:00 2014 +0530


>+++ b/source/common/pixel.cpp      Tue Dec 16 16:04:59 2014 +0530


>@@ -795,8 +795,18 @@


>     else


>     {


>         /* 4x4 is too small for sa8d */


>-        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);


>-        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);


>+        int sourceEnergy, reconEnergy;


>+        if (!HIGH_BIT_DEPTH)    // once HBD asm code is developed, if condition will go away


</pre><pre>Use a preprocessor #if here instead of a runtime if.</pre><span class=""><pre> </pre><pre>>+        {


>+            sourceEnergy = primitives.psy_acEnergy_pp(source, sstride);


>+            reconEnergy = primitives.psy_acEnergy_pp(recon, rstride);


>+        }


>+        else


>+        {


>+            //original code;


>+            sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);


>+            reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);


</pre></span><pre>The reference code could be moved into the primitives C model.</pre><div><div class="h5"><pre>>+        }


>         return abs(sourceEnergy - reconEnergy);


>     }


> }


>diff -r 775ebb4694ad -r de6f39b44c14 source/common/primitives.h


>--- a/source/common/primitives.h   Tue Dec 16 09:40:00 2014 +0530


>+++ b/source/common/primitives.h   Tue Dec 16 16:04:59 2014 +0530


>@@ -195,6 +195,7 @@


> typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);


> 


> typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);


>+typedef int(*psy_acEnergy_pp_t)(const pixel* pix, intptr_t stride);


> 


> /* Define a structure containing function pointers to optimized encoder


>  * primitives.  Each pointer can reference either an assembly routine,


>@@ -213,6 +214,7 @@


>     pixelcmp_t            sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks


>     pixelcmp_t            psy_cost_pp[NUM_SQUARE_BLOCKS];  // difference in AC energy between two blocks


>     pixelcmp_ss_t         psy_cost_ss[NUM_SQUARE_BLOCKS];


>+    psy_acEnergy_pp_t     psy_acEnergy_pp;


> 


>     dct_t                 dct[NUM_DCTS];


>     idct_t                idct[NUM_IDCTS];


>diff -r 775ebb4694ad -r de6f39b44c14 source/common/x86/asm-primitives.cpp


>--- a/source/common/x86/asm-primitives.cpp Tue Dec 16 09:40:00 2014 +0530


>+++ b/source/common/x86/asm-primitives.cpp Tue Dec 16 16:04:59 2014 +0530


>@@ -1898,6 +1898,9 @@


>         p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = x265_interp_4tap_vert_pp_16x16_avx2;


>         p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = x265_interp_4tap_vert_pp_32x32_avx2;


> #endif


>+


>+        p.psy_acEnergy_pp = x265_psy_acEnergy_pp_4x4_sse4;


>+


>     }


> #endif // if HIGH_BIT_DEPTH


> }


>diff -r 775ebb4694ad -r de6f39b44c14 source/common/x86/pixel-a.asm


>--- a/source/common/x86/pixel-a.asm        Tue Dec 16 09:40:00 2014 +0530


>+++ b/source/common/x86/pixel-a.asm        Tue Dec 16 16:04:59 2014 +0530


>@@ -6579,3 +6579,35 @@


>     mov         [r2], r3w


> .end:


>     RET


>+


>+;---------------------------------------------------------------------------------------------------------------------


>+;int psy_acEnergy_pp(const pixel* source, intptr_t sstride)


>+;---------------------------------------------------------------------------------------------------------------------


>+INIT_XMM sse4


>+cglobal psy_acEnergy_pp_4x4, 2, 3, 6


>+


>+    lea             r2, [3 * r1]


>+    movd            m0, [r0]


>+    movd            m1, [r0 + r1]


>+    movd            m2, [r0 + r1 * 2]


>+    movd            m3, [r0 + r2]


>+    shufps          m0, m1, 0


</pre></div></div><pre>Does this overwrite m0 with the lowest 32 bits of m1?</pre><pre>The computation logic below is wrong.</pre><span class=""><pre> </pre><pre>>+    shufps          m2, m3, 0


>+    mova            m4, [hmul_4p]


>+    pmaddubsw       m0, m4


>+    pmaddubsw       m2, m4


>+


>+    paddw           m5, m0, m2


>+    movhlps         m4, m5


>+    paddw           m5, m4


>+    phaddw          m5, m5


</pre></span><pre>pmaddwd could replace this phaddw to avoid Port 5; this is just an alternative, and the generated object code needs to be analyzed.</pre><pre> </pre><pre>>+    pmovzxwd        m5, m5


</pre><pre>This instruction can be removed by using the pmaddwd suggested above.</pre><span class=""><pre> </pre><pre>>+    psrld           m5, 2


>+


>+    HADAMARD 0, sumsub, 0, 2, 1, 3


>+    HADAMARD 4, sumsub, 0, 2, 1, 3


>+    HADAMARD 1, amax, 0, 2, 1, 3


>+    HADDW m0, m2


>+    psubd           m0, m5


>+    movd            eax, m0


>+    RET


</pre></span></div><br>_______________________________________________<br>


x265-devel mailing list<br>


<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>


<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>


<br></blockquote></div></div>