<div dir="ltr"><div dir="ltr"><div># HG changeset patch</div><div># User Akil Ayyappan&lt;<a href="mailto:akil@multicorewareinc.com" target="_blank">akil@multicorewareinc.com</a>&gt;</div><div># Date 1551251102 -19800</div><div># Wed Feb 27 12:35:02 2019 +0530</div><div># Node ID d12a4caf7963fd47d646040689ad5f02754ad879</div><div># Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979</div><div>x86: ssimDistortion primitive</div><div><br></div><div>This patch adds AVX2 assembly for this primitive.</div><div><br></div><div>Pushed patch to default branch of x265 repo.</div><div><br></div><div>Thanks &amp; Regards,<br>Dinesh</div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Tue, Mar 5, 2019 at 10:03 AM Akil &lt;<a href="mailto:akil@multicorewareinc.com">akil@multicorewareinc.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div dir="ltr"><div dir="ltr"><br clear="all"><div><div># HG changeset patch</div><div># User Akil Ayyappan&lt;<a href="mailto:akil@multicorewareinc.com" target="_blank">akil@multicorewareinc.com</a>&gt;</div><div># Date 1551251102 -19800</div><div># Wed Feb 27 12:35:02 2019 +0530</div><div># Node ID d12a4caf7963fd47d646040689ad5f02754ad879</div><div># Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979</div><div>x86: ssimDistortion primitive</div><div><br></div><div>This patch adds AVX2 assembly for this primitive.</div><div><br></div><div>|---------|-----------|-----------------|-----------------|</div><div>| Size |Performance|AVX2 clock cycles|CPP clock cycles |</div><div>|---------|-----------|-----------------|-----------------|</div><div>| [4x4] | 3.52x | 264.43 | 932.05 |</div><div>| [8x8] | 5.11x | 619.24 | 3163.56 |</div><div>| [16x16] | 5.44x | 2114.00 | 11490.52 |</div><div>| [32x32] | 6.01x | 7589.70 | 45608.01 |</div><div>| [64x64] | 6.70x | 27859.21 | 186634.25 
|</div><div>|---------|-----------|-----------------|-----------------|</div><div><br></div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp</div><div>--- a/source/common/pixel.cpp<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/common/pixel.cpp<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -934,6 +934,31 @@</div><div> }</div><div> }</div><div> </div><div>+template<int log2TrSize></div><div>+static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)</div><div>+{</div><div>+ *ssBlock = 0;</div><div>+ const uint32_t trSize = 1 << log2TrSize;</div><div>+ for (int y = 0; y < trSize; y++)</div><div>+ {</div><div>+ for (int x = 0; x < trSize; x++)</div><div>+ {</div><div>+ int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff</div><div>+ *ssBlock += temp * temp;</div><div>+ }</div><div>+ }</div><div>+</div><div>+ *ac_k = 0;</div><div>+ for (int block_yy = 0; block_yy < trSize; block_yy += 1)</div><div>+ {</div><div>+ for (int block_xx = 0; block_xx < trSize; block_xx += 1)</div><div>+ {</div><div>+ uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;</div><div>+ *ac_k += temp * temp;</div><div>+ }</div><div>+ }</div><div>+}</div><div>+</div><div> #if HIGH_BIT_DEPTH</div><div> static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, </div><div> const pixel minPix, const pixel maxPix)</div><div>@@ -1283,5 +1308,11 @@</div><div> p.propagateCost = estimateCUPropagateCost;</div><div> p.fix8Unpack = cuTreeFix8Unpack;</div><div> p.fix8Pack = cuTreeFix8Pack;</div><div>+</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].ssimDist = ssimDist_c<2>;</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].ssimDist = ssimDist_c<3>;</div><div>+ <a href="http://p.cu" 
target="_blank">p.cu</a>[BLOCK_16x16].ssimDist = ssimDist_c<4>;</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].ssimDist = ssimDist_c<5>;</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].ssimDist = ssimDist_c<6>;</div><div> }</div><div> }</div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h</div><div>--- a/source/common/primitives.h<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/common/primitives.h<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -227,6 +227,7 @@</div><div> typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);</div><div> typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos);</div><div> typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);</div><div>+typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);</div><div> /* Function pointers to optimized encoder primitives. 
Each pointer can reference</div><div> * either an assembly routine, a SIMD intrinsic primitive, or a C function */</div><div> struct EncoderPrimitives</div><div>@@ -303,6 +304,7 @@</div><div> psyRdoQuant_t psyRdoQuant;</div><div> <span style="white-space:pre-wrap"> </span>psyRdoQuant_t1 psyRdoQuant_1p;</div><div> <span style="white-space:pre-wrap"> </span>psyRdoQuant_t2 psyRdoQuant_2p;</div><div>+ ssimDistortion_t ssimDist;</div><div> }</div><div> cu[NUM_CU_SIZES];</div><div> /* These remaining primitives work on either fixed block sizes or take</div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp</div><div>--- a/source/common/quant.cpp<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/common/quant.cpp<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -501,15 +501,8 @@</div><div> </div><div> // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC</div><div> ssBlock = 0;</div><div>- for (int y = 0; y < trSize; y++)</div><div>- {</div><div>- for (int x = 0; x < trSize; x++)</div><div>- {</div><div>- int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff</div><div>- ssBlock += temp * temp;</div><div>- }</div><div>- }</div><div>-</div><div>+ uint64_t ac_k = 0;</div><div>+ <a href="http://primitives.cu" target="_blank">primitives.cu</a>[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride, &ssBlock, shift, &ac_k);</div><div> ssAc = ssBlock - ssDc;</div><div> </div><div> // 1. Calculation of fdc'</div><div>@@ -535,15 +528,6 @@</div><div> uint64_t fAc_num = 0;</div><div> </div><div> // 2. 
Calculate ac component</div><div>- uint64_t ac_k = 0;</div><div>- for (int block_yy = 0; block_yy < trSize; block_yy += 1)</div><div>- {</div><div>- for (int block_xx = 0; block_xx < trSize; block_xx += 1)</div><div>- {</div><div>- uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;</div><div>- ac_k += temp * temp;</div><div>- }</div><div>- }</div><div> ac_k -= dc_k;</div><div> </div><div> double s = 1 + 0.005 * cu.m_qp[absPartIdx];</div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/asm-primitives.cpp</div><div>--- a/source/common/x86/asm-primitives.cpp<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/common/x86/asm-primitives.cpp<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -2319,6 +2319,12 @@</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);</div><div> </div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].ssimDist = PFX(ssimDist4_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].ssimDist = PFX(ssimDist8_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].ssimDist = PFX(ssimDist16_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);</div><div>+</div><div> /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only </div><div> p.planeClipAndMax = PFX(planeClipAndMax_avx2); */</div><div> </div><div>@@ -4706,6 +4712,12 @@</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].psyRdoQuant_1p = 
PFX(psyRdoQuant_1p32_avx2);</div><div> </div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].ssimDist = PFX(ssimDist4_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].ssimDist = PFX(ssimDist8_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].ssimDist = PFX(ssimDist16_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);</div><div>+</div><div> }</div><div> if (cpuMask & X265_CPU_AVX512)</div><div> {</div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/pixel-a.asm</div><div>--- a/source/common/x86/pixel-a.asm<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/common/x86/pixel-a.asm<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -73,6 +73,16 @@</div><div> cextern pb_movemask_32</div><div> cextern pw_pixel_max</div><div> </div><div>+%if BIT_DEPTH == 12</div><div>+ %define SSIMRD_SHIFT 4</div><div>+%elif BIT_DEPTH == 10</div><div>+ %define SSIMRD_SHIFT 2</div><div>+%elif BIT_DEPTH == 8</div><div>+ %define SSIMRD_SHIFT 0</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div> ;=============================================================================</div><div> ; SATD</div><div> ;=============================================================================</div><div>@@ -360,6 +370,24 @@</div><div> RET</div><div> %endmacro</div><div> </div><div>+%macro SSIM_RD_COL 2</div><div>+ vpsrld m6, m0, SSIMRD_SHIFT</div><div>+ vpsubd m0, m1</div><div>+</div><div>+ vpmuldq m2, m0, m0</div><div>+ vpsrldq m0, m0, 4</div><div>+ vpmuldq m0, m0, m0</div><div>+ vpaddq m0, m2</div><div>+</div><div>+ vpmuldq m2, m6, m6</div><div>+ vpsrldq m6, m6, 4</div><div>+ vpmuldq m6, m6, m6</div><div>+ vpaddq m6, 
m2</div><div>+</div><div>+ vpaddq m4, m0</div><div>+ vpaddq m7, m6</div><div>+%endmacro</div><div>+</div><div> ; FIXME avoid the spilling of regs to hold 3*stride.</div><div> ; for small blocks on x86_32, modify pixel pointer instead.</div><div> </div><div>@@ -15883,3 +15911,395 @@</div><div> RET</div><div> %endif</div><div> %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10</div><div>+</div><div>+;template<int log2TrSize></div><div>+;static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)</div><div>+;{</div><div>+; *ssBlock = 0;</div><div>+; const uint32_t trSize = 1 << log2TrSize;</div><div>+; for (int y = 0; y < trSize; y++)</div><div>+; {</div><div>+; for (int x = 0; x < trSize; x++)</div><div>+; {</div><div>+; int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff</div><div>+; *ssBlock += temp * temp;</div><div>+; }</div><div>+; }</div><div>+;</div><div>+; *ac_k = 0;</div><div>+; for (int block_yy = 0; block_yy < trSize; block_yy += 1)</div><div>+; {</div><div>+; for (int block_xx = 0; block_xx < trSize; block_xx += 1)</div><div>+; {</div><div>+; uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;</div><div>+; *ac_k += temp * temp;</div><div>+; }</div><div>+; }</div><div>+;}</div><div>+;-----------------------------------------------------------------------------------------------------------------</div><div>+; void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)</div><div>+;-----------------------------------------------------------------------------------------------------------------</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal ssimDist4, 7, 8, 8</div><div>+ mov r7d, 4</div><div>+ vpxor m4, m4 ;ssBlock</div><div>+ vpxor m3, m3</div><div>+ vpxor m7, m7 ;ac_k</div><div>+.row:</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwq m0, [r0] 
;fenc</div><div>+ vpmovzxwq m1, [r2] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbq m0, [r0]</div><div>+ vpmovzxbq m1, [r2]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+ vpsrlq m6, m0, SSIMRD_SHIFT</div><div>+ vpsubq m0, m1</div><div>+ vpmuldq m0, m0, m0</div><div>+ vpmuldq m6, m6, m6</div><div>+ vpaddq m4, m0</div><div>+ vpaddq m7, m6</div><div>+</div><div>+%if HIGH_BIT_DEPTH</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+%else</div><div>+ lea r0, [r0 + r1]</div><div>+ lea r2, [r2 + r3]</div><div>+%endif</div><div>+ dec r7d</div><div>+ jnz .row</div><div>+ vextracti128 xm5, m4, 1</div><div>+ vpaddq xm4, xm5</div><div>+ punpckhqdq xm2, xm4, xm3</div><div>+ paddq xm4, xm2</div><div>+</div><div>+ vextracti128 xm5, m7, 1</div><div>+ vpaddq xm7, xm5</div><div>+ punpckhqdq xm2, xm7, xm3</div><div>+ paddq xm7, xm2</div><div>+</div><div>+ movq [r4], xm4</div><div>+ movq [r6], xm7</div><div>+ RET</div><div>+</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal ssimDist8, 7, 8, 8</div><div>+ mov r7d, 8</div><div>+ vpxor m4, m4 ;ssBlock</div><div>+ vpxor m3, m3</div><div>+ vpxor m7, m7 ;ac_k</div><div>+.row:</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0] ;fenc</div><div>+ vpmovzxwd m1, [r2] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0]</div><div>+ vpmovzxbd m1, [r2]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+%if HIGH_BIT_DEPTH</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+%else</div><div>+ lea r0, [r0 + r1]</div><div>+ lea r2, [r2 + r3]</div><div>+%endif</div><div>+ dec r7d</div><div>+ jnz .row</div><div>+ vextracti128 xm5, m4, 1</div><div>+ vpaddq xm4, xm5</div><div>+ punpckhqdq xm2, xm4, xm3</div><div>+ paddq xm4, xm2</div><div>+</div><div>+ vextracti128 xm5, m7, 1</div><div>+ vpaddq xm7, xm5</div><div>+ punpckhqdq 
xm2, xm7, xm3</div><div>+ paddq xm7, xm2</div><div>+</div><div>+ movq [r4], xm4</div><div>+ movq [r6], xm7</div><div>+ RET</div><div>+</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal ssimDist16, 7, 8, 8</div><div>+ mov r7d, 16</div><div>+ vpxor m4, m4 ;ssBlock</div><div>+ vpxor m3, m3</div><div>+ vpxor m7, m7 ;ac_k</div><div>+.row:</div><div>+;Col 1-8</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0] ;fenc</div><div>+ vpmovzxwd m1, [r2] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0]</div><div>+ vpmovzxbd m1, [r2]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 9-16</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 16] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 16] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 8]</div><div>+ vpmovzxbd m1, [r2 + 8]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+%if HIGH_BIT_DEPTH</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+%else</div><div>+ lea r0, [r0 + r1]</div><div>+ lea r2, [r2 + r3]</div><div>+%endif</div><div>+ dec r7d</div><div>+ jnz .row</div><div>+ vextracti128 xm5, m4, 1</div><div>+ vpaddq xm4, xm5</div><div>+ punpckhqdq xm2, xm4, xm3</div><div>+ paddq xm4, xm2</div><div>+</div><div>+ vextracti128 xm5, m7, 1</div><div>+ vpaddq xm7, xm5</div><div>+ punpckhqdq xm2, xm7, xm3</div><div>+ paddq xm7, xm2</div><div>+</div><div>+ movq [r4], xm4</div><div>+ movq [r6], xm7</div><div>+ RET</div><div>+</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal ssimDist32, 7, 8, 8 </div><div>+ mov r7d, 32</div><div>+ vpxor m4, m4 ;ssBlock</div><div>+ vpxor m3, m3</div><div>+ vpxor m7, m7 ;ac_k</div><div>+.row:</div><div>+;Col 1-8</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0] ;fenc</div><div>+ vpmovzxwd m1, [r2] 
;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0]</div><div>+ vpmovzxbd m1, [r2]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 9-16</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 16] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 16] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 8]</div><div>+ vpmovzxbd m1, [r2 + 8]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 17-24</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 32] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 32] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 16]</div><div>+ vpmovzxbd m1, [r2 + 16]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 25-32</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 48] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 48] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 24]</div><div>+ vpmovzxbd m1, [r2 + 24]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+%if HIGH_BIT_DEPTH</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+%else</div><div>+ lea r0, [r0 + r1]</div><div>+ lea r2, [r2 + r3]</div><div>+%endif</div><div>+ dec r7d</div><div>+ jnz .row</div><div>+ vextracti128 xm5, m4, 1</div><div>+ vpaddq xm4, xm5</div><div>+ punpckhqdq xm2, xm4, xm3</div><div>+ paddq xm4, xm2</div><div>+</div><div>+ vextracti128 xm5, m7, 1</div><div>+ vpaddq xm7, xm5</div><div>+ punpckhqdq xm2, xm7, xm3</div><div>+ paddq xm7, xm2</div><div>+</div><div>+ movq [r4], xm4</div><div>+ movq [r6], xm7</div><div>+ RET</div><div>+</div><div>+</div><div>+INIT_YMM 
avx2</div><div>+cglobal ssimDist64, 7, 8, 8 </div><div>+ mov r7d, 64</div><div>+ vpxor m4, m4 ;ssBlock</div><div>+ vpxor m3, m3</div><div>+ vpxor m7, m7 ;ac_k</div><div>+.row:</div><div>+;Col 1-8</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0] ;fenc</div><div>+ vpmovzxwd m1, [r2] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0]</div><div>+ vpmovzxbd m1, [r2]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 9-16</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 16] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 16] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 8]</div><div>+ vpmovzxbd m1, [r2 + 8]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 17-24</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 32] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 32] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 16]</div><div>+ vpmovzxbd m1, [r2 + 16]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 25-32</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 48] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 48] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 24]</div><div>+ vpmovzxbd m1, [r2 + 24]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 33-40</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 64] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 64] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 32]</div><div>+ vpmovzxbd m1, [r2 + 32]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL 
m0, m1</div><div>+</div><div>+;Col 41-48</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 80] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 80] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 40]</div><div>+ vpmovzxbd m1, [r2 + 40]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 49-56</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 96] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 96] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 48]</div><div>+ vpmovzxbd m1, [r2 + 48]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+;Col 57-64</div><div>+%if HIGH_BIT_DEPTH</div><div>+ vpmovzxwd m0, [r0 + 112] ;fenc</div><div>+ vpmovzxwd m1, [r2 + 112] ;recon</div><div>+%elif BIT_DEPTH == 8</div><div>+ vpmovzxbd m0, [r0 + 56]</div><div>+ vpmovzxbd m1, [r2 + 56]</div><div>+%else</div><div>+ %error Unsupported BIT_DEPTH!</div><div>+%endif</div><div>+</div><div>+ SSIM_RD_COL m0, m1</div><div>+</div><div>+%if HIGH_BIT_DEPTH</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+%else</div><div>+ lea r0, [r0 + r1]</div><div>+ lea r2, [r2 + r3]</div><div>+%endif</div><div>+ dec r7d</div><div>+ jnz .row</div><div>+ vextracti128 xm5, m4, 1</div><div>+ vpaddq xm4, xm5</div><div>+ punpckhqdq xm2, xm4, xm3</div><div>+ paddq xm4, xm2</div><div>+</div><div>+ vextracti128 xm5, m7, 1</div><div>+ vpaddq xm7, xm5</div><div>+ punpckhqdq xm2, xm7, xm3</div><div>+ paddq xm7, xm2</div><div>+</div><div>+ movq [r4], xm4</div><div>+ movq [r6], xm7</div><div>+ RET</div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/pixel.h</div><div>--- a/source/common/x86/pixel.h<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/common/x86/pixel.h<span style="white-space:pre-wrap"> </span>Wed 
Feb 27 12:35:02 2019 +0530</div><div>@@ -60,7 +60,8 @@</div><div> FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \</div><div> FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \</div><div> FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \</div><div>- FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)</div><div>+ FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); \</div><div>+ FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)</div><div> </div><div> DECL_PIXELS(mmx);</div><div> DECL_PIXELS(mmx2);</div><div>diff -r cb3e172a5f51 -r d12a4caf7963 source/test/pixelharness.cpp</div><div>--- a/source/test/pixelharness.cpp<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/test/pixelharness.cpp<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -2270,6 +2270,32 @@</div><div> return true;</div><div> }</div><div> </div><div>+bool PixelHarness::check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt)</div><div>+{</div><div>+ uint32_t srcStride[5] = { 4, 8, 16, 32, 64 };</div><div>+ intptr_t dstStride[5] = { 4, 8, 16, 32, 64 };</div><div>+ int shift = X265_DEPTH - 8;</div><div>+ uint64_t opt_dest1 = 0, ref_dest1 = 0, opt_dest2 = 0, ref_dest2 = 0;</div><div>+ int j = 0;</div><div>+</div><div>+ for (int i = 0; i < ITERS; i++)</div><div>+ {</div><div>+ int index = i % TEST_CASES;</div><div>+ int k1 = rand() % 5, k2 = rand() % 5;</div><div>+ ref(pixel_test_buff[index] + j, srcStride[k1], pixel_test_buff[index + 10] + j, dstStride[k2], &ref_dest1, shift, &ref_dest2);</div><div>+ opt(pixel_test_buff[index] + j, srcStride[k1], pixel_test_buff[index + 10] + j, dstStride[k2], 
&opt_dest1, shift, &opt_dest2);</div><div>+</div><div>+ if (opt_dest1 != ref_dest1 && opt_dest2 != ref_dest2)</div><div>+ {</div><div>+ return false;</div><div>+ }</div><div>+</div><div>+ reportfail()</div><div>+ j += INCR;</div><div>+ }</div><div>+ return true;</div><div>+}</div><div>+</div><div> bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)</div><div> {</div><div> if (opt.pu[part].satd)</div><div>@@ -2607,6 +2633,15 @@</div><div> }</div><div> }</div><div> </div><div>+ if (<a href="http://opt.cu" target="_blank">opt.cu</a>[i].ssimDist)</div><div>+ {</div><div>+ if (!check_ssimDist(<a href="http://ref.cu" target="_blank">ref.cu</a>[i].ssimDist, <a href="http://opt.cu" target="_blank">opt.cu</a>[i].ssimDist))</div><div>+ {</div><div>+ printf("\nssimDist[%dx%d] failed!\n", 4 << i, 4 << i);</div><div>+ return false;</div><div>+ }</div><div>+ }</div><div>+</div><div> if (i < BLOCK_64x64)</div><div> {</div><div> /* TU only primitives */</div><div>@@ -3093,6 +3128,7 @@</div><div> return false;</div><div> }</div><div> }</div><div>+</div><div> return true;</div><div> }</div><div> </div><div>@@ -3392,6 +3428,14 @@</div><div> HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);</div><div> REPORT_SPEEDUP(<a href="http://opt.cu" target="_blank">opt.cu</a>[i].psy_cost_pp, <a href="http://ref.cu" target="_blank">ref.cu</a>[i].psy_cost_pp, pbuf1, STRIDE, pbuf2, STRIDE);</div><div> }</div><div>+</div><div>+ if (<a href="http://opt.cu" target="_blank">opt.cu</a>[i].ssimDist)</div><div>+ {</div><div>+ uint64_t dst1 = 0, dst2 = 0;</div><div>+ int shift = X265_DEPTH - 8;</div><div>+ printf("ssimDist[%dx%d]", 4 << i, 4 << i);</div><div>+ REPORT_SPEEDUP(<a href="http://opt.cu" target="_blank">opt.cu</a>[i].ssimDist, <a href="http://ref.cu" target="_blank">ref.cu</a>[i].ssimDist, pixel_test_buff[0], 32, pixel_test_buff[5], 64, &dst1, shift, &dst2);</div><div>+ }</div><div> }</div><div> </div><div> if (opt.weight_pp)</div><div>diff -r 
cb3e172a5f51 -r d12a4caf7963 source/test/pixelharness.h</div><div>--- a/source/test/pixelharness.h<span style="white-space:pre-wrap"> </span>Tue Feb 19 20:20:35 2019 +0530</div><div>+++ b/source/test/pixelharness.h<span style="white-space:pre-wrap"> </span>Wed Feb 27 12:35:02 2019 +0530</div><div>@@ -136,6 +136,7 @@</div><div> bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt);</div><div> bool check_integral_initv(integralv_t ref, integralv_t opt);</div><div> bool check_integral_inith(integralh_t ref, integralh_t opt);</div><div>+ bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);</div><div> </div><div> public:</div><div> </div></div><div><br></div>-- <br><div dir="ltr" class="gmail-m_7362206174838744524gmail_signature"><div dir="ltr"><i><b>Regards,</b></i><div><i><b>Akil R</b></i></div></div></div></div></div>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div></div>