[x265] [PATCH x265] Add AVX2 assembly code for ssimDistortion primitive.
Dinesh Kumar Reddy
dinesh at multicorewareinc.com
Thu Mar 7 10:45:06 CET 2019
# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1551251102 -19800
# Wed Feb 27 12:35:02 2019 +0530
# Node ID d12a4caf7963fd47d646040689ad5f02754ad879
# Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979
x86: ssimDistortion primitive
This patch adds AVX2 assembly for this primitive.
Pushed patch to default branch of x265 repo.
Thanks & Regards,
Dinesh
On Tue, Mar 5, 2019 at 10:03 AM Akil <akil at multicorewareinc.com> wrote:
>
> # HG changeset patch
> # User Akil Ayyappan<akil at multicorewareinc.com>
> # Date 1551251102 -19800
> # Wed Feb 27 12:35:02 2019 +0530
> # Node ID d12a4caf7963fd47d646040689ad5f02754ad879
> # Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979
> x86: ssimDistortion primitive
>
> This patch adds AVX2 assembly for this primitive.
>
> |---------|-----------|-----------------|-----------------|
> | Size |Performance|AVX2 clock cycles|CPP clock cycles |
> |---------|-----------|-----------------|-----------------|
> | [4x4] | 3.52x | 264.43 | 932.05 |
> | [8x8] | 5.11x | 619.24 | 3163.56 |
> | [16x16] | 5.44x | 2114.00 | 11490.52 |
> | [32x32] | 6.01x | 7589.70 | 45608.01 |
> | [64x64] | 6.70x | 27859.21 | 186634.25 |
> |---------|-----------|-----------------|-----------------|
>
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
> @@ -934,6 +934,31 @@
> }
> }
>
> +template<int log2TrSize>
> +static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel*
> recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
> +{
> + *ssBlock = 0;
> + const uint32_t trSize = 1 << log2TrSize;
> + for (int y = 0; y < trSize; y++)
> + {
> + for (int x = 0; x < trSize; x++)
> + {
> + int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
> copy of residual coeff
> + *ssBlock += temp * temp;
> + }
> + }
> +
> + *ac_k = 0;
> + for (int block_yy = 0; block_yy < trSize; block_yy += 1)
> + {
> + for (int block_xx = 0; block_xx < trSize; block_xx += 1)
> + {
> + uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
> + *ac_k += temp * temp;
> + }
> + }
> +}
> +
> #if HIGH_BIT_DEPTH
> static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width,
> int height, uint64_t *outsum,
> const pixel minPix, const pixel maxPix)
> @@ -1283,5 +1308,11 @@
> p.propagateCost = estimateCUPropagateCost;
> p.fix8Unpack = cuTreeFix8Unpack;
> p.fix8Pack = cuTreeFix8Pack;
> +
> + p.cu[BLOCK_4x4].ssimDist = ssimDist_c<2>;
> + p.cu[BLOCK_8x8].ssimDist = ssimDist_c<3>;
> + p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
> + p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
> + p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
> }
> }
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h
> --- a/source/common/primitives.h Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
> @@ -227,6 +227,7 @@
> typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t
> *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
> *totalRdCost, int64_t *psyScale, uint32_t blkPos);
> typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t
> *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t
> blkPos);
> typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
> *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
> *totalRdCost, int64_t *psyScale, uint32_t blkPos);
> +typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride,
> const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift,
> uint64_t *ac_k);
> /* Function pointers to optimized encoder primitives. Each pointer can
> reference
> * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
> struct EncoderPrimitives
> @@ -303,6 +304,7 @@
> psyRdoQuant_t psyRdoQuant;
> psyRdoQuant_t1 psyRdoQuant_1p;
> psyRdoQuant_t2 psyRdoQuant_2p;
> + ssimDistortion_t ssimDist;
> }
> cu[NUM_CU_SIZES];
> /* These remaining primitives work on either fixed block sizes or take
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp
> --- a/source/common/quant.cpp Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/quant.cpp Wed Feb 27 12:35:02 2019 +0530
> @@ -501,15 +501,8 @@
>
> // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
> ssBlock = 0;
> - for (int y = 0; y < trSize; y++)
> - {
> - for (int x = 0; x < trSize; x++)
> - {
> - int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
> copy of residual coeff
> - ssBlock += temp * temp;
> - }
> - }
> -
> + uint64_t ac_k = 0;
> + primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon,
> rstride, &ssBlock, shift, &ac_k);
> ssAc = ssBlock - ssDc;
>
> // 1. Calculation of fdc'
> @@ -535,15 +528,6 @@
> uint64_t fAc_num = 0;
>
> // 2. Calculate ac component
> - uint64_t ac_k = 0;
> - for (int block_yy = 0; block_yy < trSize; block_yy += 1)
> - {
> - for (int block_xx = 0; block_xx < trSize; block_xx += 1)
> - {
> - uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
> - ac_k += temp * temp;
> - }
> - }
> ac_k -= dc_k;
>
> double s = 1 + 0.005 * cu.m_qp[absPartIdx];
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/x86/asm-primitives.cpp Wed Feb 27 12:35:02 2019 +0530
> @@ -2319,6 +2319,12 @@
> p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
> p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
>
> + p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_avx2);
> + p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_avx2);
> + p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_avx2);
> + p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
> + p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
> +
> /* TODO: This kernel needs to be modified to work with
> HIGH_BIT_DEPTH only
> p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
>
> @@ -4706,6 +4712,12 @@
> p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
> p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
>
> + p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_avx2);
> + p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_avx2);
> + p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_avx2);
> + p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
> + p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
> +
> }
> if (cpuMask & X265_CPU_AVX512)
> {
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/x86/pixel-a.asm Wed Feb 27 12:35:02 2019 +0530
> @@ -73,6 +73,16 @@
> cextern pb_movemask_32
> cextern pw_pixel_max
>
> +%if BIT_DEPTH == 12
> + %define SSIMRD_SHIFT 4
> +%elif BIT_DEPTH == 10
> + %define SSIMRD_SHIFT 2
> +%elif BIT_DEPTH == 8
> + %define SSIMRD_SHIFT 0
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
>
> ;=============================================================================
> ; SATD
>
> ;=============================================================================
> @@ -360,6 +370,24 @@
> RET
> %endmacro
>
; Accumulate one 8-pixel column chunk for the ssimDist kernels.
; Inputs:  m0 = 8 fenc pixels zero-extended to dwords
;          m1 = 8 recon pixels zero-extended to dwords
; Updates: m4 += sum of (fenc - recon)^2   (ssBlock partial sums, qword lanes)
;          m7 += sum of (fenc >> SSIMRD_SHIFT)^2  (ac_k partial sums, qword lanes)
; Clobbers m2 and m6.
; Widening trick: vpmuldq squares the even dword lanes into qwords; a 4-byte
; in-lane shift (vpsrldq) moves the odd dwords into even slots so a second
; vpmuldq squares those, and vpaddq combines both halves.
%macro SSIM_RD_COL 2
    vpsrld m6, m0, SSIMRD_SHIFT    ; m6 = fenc normalized to 8-bit range
    vpsubd m0, m1                  ; m0 = fenc - recon (residual)

    vpmuldq m2, m0, m0             ; square even dword lanes -> qwords
    vpsrldq m0, m0, 4              ; odd dword lanes into even positions
    vpmuldq m0, m0, m0             ; square odd lanes
    vpaddq m0, m2                  ; all eight squared residuals, as qwords

    vpmuldq m2, m6, m6             ; same widening square for fenc>>shift
    vpsrldq m6, m6, 4
    vpmuldq m6, m6, m6
    vpaddq m6, m2

    vpaddq m4, m0                  ; accumulate ssBlock
    vpaddq m7, m6                  ; accumulate ac_k
%endmacro
> +
> ; FIXME avoid the spilling of regs to hold 3*stride.
> ; for small blocks on x86_32, modify pixel pointer instead.
>
> @@ -15883,3 +15911,395 @@
> RET
> %endif
> %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
> +
> +;template<int log2TrSize>
> +;static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel*
> recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
> +;{
> +; *ssBlock = 0;
> +; const uint32_t trSize = 1 << log2TrSize;
> +; for (int y = 0; y < trSize; y++)
> +; {
> +; for (int x = 0; x < trSize; x++)
> +; {
> +; int temp = fenc[y * fStride + x] - recon[y * rstride + x];
> // copy of residual coeff
> +; *ssBlock += temp * temp;
> +; }
> +; }
> +;
> +; *ac_k = 0;
> +; for (int block_yy = 0; block_yy < trSize; block_yy += 1)
> +; {
> +; for (int block_xx = 0; block_xx < trSize; block_xx += 1)
> +; {
> +; uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
> +; *ac_k += temp * temp;
> +; }
> +; }
> +;}
>
> +;-----------------------------------------------------------------------------------------------------------------
> +; void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel*
> recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
>
> +;-----------------------------------------------------------------------------------------------------------------
> +
; void ssimDist4(const pixel *fenc /*r0*/, uint32_t fStride /*r1*/,
;                const pixel *recon /*r2*/, intptr_t rstride /*r3*/,
;                uint64_t *ssBlock /*r4*/, int shift /*r5*/, uint64_t *ac_k /*r6*/)
; 4x4 block. Note: r5 (shift) is unused at runtime; the shift is baked in at
; assembly time via SSIMRD_SHIFT, selected from BIT_DEPTH.
; Accumulators: m4 = ssBlock partial sums, m7 = ac_k partial sums (qword lanes);
; m3 stays zero and supplies the zero half for the final punpckhqdq reduction.
INIT_YMM avx2
cglobal ssimDist4, 7, 8, 8
    mov r7d, 4                      ; row counter
    vpxor m4, m4                    ;ssBlock
    vpxor m3, m3
    vpxor m7, m7                    ;ac_k
.row:
%if HIGH_BIT_DEPTH
    vpmovzxwq m0, [r0]              ;fenc: 4 words -> 4 qword lanes
    vpmovzxwq m1, [r2]              ;recon
%elif BIT_DEPTH == 8
    vpmovzxbq m0, [r0]              ; 4 bytes -> 4 qword lanes
    vpmovzxbq m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif
    ; only 4 pixels/row, already in qword lanes: square directly, no macro
    vpsrlq m6, m0, SSIMRD_SHIFT     ; fenc >> shift
    vpsubq m0, m1                   ; residual
    vpmuldq m0, m0, m0              ; residual^2
    vpmuldq m6, m6, m6              ; (fenc>>shift)^2
    vpaddq m4, m0
    vpaddq m7, m6

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]           ; strides are in pixels; x2 for 16-bit pixels
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r7d
    jnz .row
    ; horizontal reduction of m4: fold upper 128-bit lane, then high qword
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1         ; same reduction for ac_k
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4                  ; *ssBlock = low qword
    movq [r6], xm7                  ; *ac_k    = low qword
    RET
> +
> +
; void ssimDist8(const pixel *fenc /*r0*/, uint32_t fStride /*r1*/,
;                const pixel *recon /*r2*/, intptr_t rstride /*r3*/,
;                uint64_t *ssBlock /*r4*/, int shift /*r5; unused, SSIMRD_SHIFT*/,
;                uint64_t *ac_k /*r6*/)
; 8x8 block: one 8-pixel column chunk per row via SSIM_RD_COL.
; m4 / m7 accumulate ssBlock / ac_k; m3 stays zero for the final reduction.
INIT_YMM avx2
cglobal ssimDist8, 7, 8, 8
    mov r7d, 8                      ; row counter
    vpxor m4, m4                    ;ssBlock
    vpxor m3, m3
    vpxor m7, m7                    ;ac_k
.row:
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0]              ;fenc: 8 words -> 8 dword lanes
    vpmovzxwd m1, [r2]              ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0]
    vpmovzxbd m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]           ; pixel strides, x2 bytes for 16-bit pixels
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r7d
    jnz .row
    ; horizontal reduction: fold upper 128-bit lane, then high qword
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4                  ; *ssBlock
    movq [r6], xm7                  ; *ac_k
    RET
> +
> +
; void ssimDist16(const pixel *fenc /*r0*/, uint32_t fStride /*r1*/,
;                 const pixel *recon /*r2*/, intptr_t rstride /*r3*/,
;                 uint64_t *ssBlock /*r4*/, int shift /*r5; unused, SSIMRD_SHIFT*/,
;                 uint64_t *ac_k /*r6*/)
; 16x16 block: two 8-pixel column chunks per row. Byte offsets differ per
; depth (16 bytes vs 8 bytes per 8 pixels).
INIT_YMM avx2
cglobal ssimDist16, 7, 8, 8
    mov r7d, 16                     ; row counter
    vpxor m4, m4                    ;ssBlock
    vpxor m3, m3
    vpxor m7, m7                    ;ac_k
.row:
;Col 1-8
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0]              ;fenc
    vpmovzxwd m1, [r2]              ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0]
    vpmovzxbd m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 9-16
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 16]         ;fenc
    vpmovzxwd m1, [r2 + 16]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 8]
    vpmovzxbd m1, [r2 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]           ; advance one row (16-bit pixels)
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r7d
    jnz .row
    ; horizontal reduction: fold upper 128-bit lane, then high qword
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4                  ; *ssBlock
    movq [r6], xm7                  ; *ac_k
    RET
> +
> +
; void ssimDist32(const pixel *fenc /*r0*/, uint32_t fStride /*r1*/,
;                 const pixel *recon /*r2*/, intptr_t rstride /*r3*/,
;                 uint64_t *ssBlock /*r4*/, int shift /*r5; unused, SSIMRD_SHIFT*/,
;                 uint64_t *ac_k /*r6*/)
; 32x32 block: four 8-pixel column chunks per row.
INIT_YMM avx2
cglobal ssimDist32, 7, 8, 8
    mov r7d, 32                     ; row counter
    vpxor m4, m4                    ;ssBlock
    vpxor m3, m3
    vpxor m7, m7                    ;ac_k
.row:
;Col 1-8
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0]              ;fenc
    vpmovzxwd m1, [r2]              ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0]
    vpmovzxbd m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 9-16
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 16]         ;fenc
    vpmovzxwd m1, [r2 + 16]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 8]
    vpmovzxbd m1, [r2 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 17-24
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 32]         ;fenc
    vpmovzxwd m1, [r2 + 32]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 16]
    vpmovzxbd m1, [r2 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 25-32
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 48]         ;fenc
    vpmovzxwd m1, [r2 + 48]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 24]
    vpmovzxbd m1, [r2 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]           ; advance one row (16-bit pixels)
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r7d
    jnz .row
    ; horizontal reduction: fold upper 128-bit lane, then high qword
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4                  ; *ssBlock
    movq [r6], xm7                  ; *ac_k
    RET
> +
> +
; void ssimDist64(const pixel *fenc /*r0*/, uint32_t fStride /*r1*/,
;                 const pixel *recon /*r2*/, intptr_t rstride /*r3*/,
;                 uint64_t *ssBlock /*r4*/, int shift /*r5; unused, SSIMRD_SHIFT*/,
;                 uint64_t *ac_k /*r6*/)
; 64x64 block: eight 8-pixel column chunks per row.
INIT_YMM avx2
cglobal ssimDist64, 7, 8, 8
    mov r7d, 64                     ; row counter
    vpxor m4, m4                    ;ssBlock
    vpxor m3, m3
    vpxor m7, m7                    ;ac_k
.row:
;Col 1-8
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0]              ;fenc
    vpmovzxwd m1, [r2]              ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0]
    vpmovzxbd m1, [r2]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 9-16
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 16]         ;fenc
    vpmovzxwd m1, [r2 + 16]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 8]
    vpmovzxbd m1, [r2 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 17-24
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 32]         ;fenc
    vpmovzxwd m1, [r2 + 32]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 16]
    vpmovzxbd m1, [r2 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 25-32
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 48]         ;fenc
    vpmovzxwd m1, [r2 + 48]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 24]
    vpmovzxbd m1, [r2 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 33-40
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 64]         ;fenc
    vpmovzxwd m1, [r2 + 64]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 32]
    vpmovzxbd m1, [r2 + 32]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 41-48
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 80]         ;fenc
    vpmovzxwd m1, [r2 + 80]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 40]
    vpmovzxbd m1, [r2 + 40]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 49-56
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 96]         ;fenc
    vpmovzxwd m1, [r2 + 96]         ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 48]
    vpmovzxbd m1, [r2 + 48]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

;Col 57-64
%if HIGH_BIT_DEPTH
    vpmovzxwd m0, [r0 + 112]        ;fenc
    vpmovzxwd m1, [r2 + 112]        ;recon
%elif BIT_DEPTH == 8
    vpmovzxbd m0, [r0 + 56]
    vpmovzxbd m1, [r2 + 56]
%else
    %error Unsupported BIT_DEPTH!
%endif

    SSIM_RD_COL m0, m1

%if HIGH_BIT_DEPTH
    lea r0, [r0 + 2 * r1]           ; advance one row (16-bit pixels)
    lea r2, [r2 + 2 * r3]
%else
    lea r0, [r0 + r1]
    lea r2, [r2 + r3]
%endif
    dec r7d
    jnz .row
    ; horizontal reduction: fold upper 128-bit lane, then high qword
    vextracti128 xm5, m4, 1
    vpaddq xm4, xm5
    punpckhqdq xm2, xm4, xm3
    paddq xm4, xm2

    vextracti128 xm5, m7, 1
    vpaddq xm7, xm5
    punpckhqdq xm2, xm7, xm3
    paddq xm7, xm2

    movq [r4], xm4                  ; *ssBlock
    movq [r6], xm7                  ; *ac_k
    RET
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/x86/pixel.h Wed Feb 27 12:35:02 2019 +0530
> @@ -60,7 +60,8 @@
> FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*,
> intptr_t); \
> FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
> FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t
> sstride, const pixel* recon, intptr_t rstride); \
> - FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t
> sstride, const int16_t* recon, intptr_t rstride)
> + FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t
> sstride, const int16_t* recon, intptr_t rstride); \
> + FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t
> fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int
> shift, uint64_t *ac_k)
>
> DECL_PIXELS(mmx);
> DECL_PIXELS(mmx2);
> diff -r cb3e172a5f51 -r d12a4caf7963 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/test/pixelharness.cpp Wed Feb 27 12:35:02 2019 +0530
> @@ -2270,6 +2270,32 @@
> return true;
> }
>
> +bool PixelHarness::check_ssimDist(ssimDistortion_t ref, ssimDistortion_t
> opt)
> +{
> + uint32_t srcStride[5] = { 4, 8, 16, 32, 64 };
> + intptr_t dstStride[5] = { 4, 8, 16, 32, 64 };
> + int shift = X265_DEPTH - 8;
> + uint64_t opt_dest1 = 0, ref_dest1 = 0, opt_dest2 = 0, ref_dest2 = 0;
> + int j = 0;
> +
> + for (int i = 0; i < ITERS; i++)
> + {
> + int index = i % TEST_CASES;
> + int k1 = rand() % 5, k2 = rand() % 5;
> + ref(pixel_test_buff[index] + j, srcStride[k1],
> pixel_test_buff[index + 10] + j, dstStride[k2], &ref_dest1, shift,
> &ref_dest2);
> + opt(pixel_test_buff[index] + j, srcStride[k1],
> pixel_test_buff[index + 10] + j, dstStride[k2], &opt_dest1, shift,
> &opt_dest2);
> +
> + if (opt_dest1 != ref_dest1 && opt_dest2 != ref_dest2)
> + {
> + return false;
> + }
> +
> + reportfail()
> + j += INCR;
> + }
> + return true;
> +}
> +
> bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const
> EncoderPrimitives& opt)
> {
> if (opt.pu[part].satd)
> @@ -2607,6 +2633,15 @@
> }
> }
>
> + if (opt.cu[i].ssimDist)
> + {
> + if (!check_ssimDist(ref.cu[i].ssimDist, opt.cu[i].ssimDist))
> + {
> + printf("\nssimDist[%dx%d] failed!\n", 4 << i, 4 << i);
> + return false;
> + }
> + }
> +
> if (i < BLOCK_64x64)
> {
> /* TU only primitives */
> @@ -3093,6 +3128,7 @@
> return false;
> }
> }
> +
> return true;
> }
>
> @@ -3392,6 +3428,14 @@
> HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
> REPORT_SPEEDUP(opt.cu[i].psy_cost_pp, ref.cu[i].psy_cost_pp,
> pbuf1, STRIDE, pbuf2, STRIDE);
> }
> +
> + if (opt.cu[i].ssimDist)
> + {
> + uint64_t dst1 = 0, dst2 = 0;
> + int shift = X265_DEPTH - 8;
> + printf("ssimDist[%dx%d]", 4 << i, 4 << i);
> + REPORT_SPEEDUP(opt.cu[i].ssimDist, ref.cu[i].ssimDist,
> pixel_test_buff[0], 32, pixel_test_buff[5], 64, &dst1, shift, &dst2);
> + }
> }
>
> if (opt.weight_pp)
> diff -r cb3e172a5f51 -r d12a4caf7963 source/test/pixelharness.h
> --- a/source/test/pixelharness.h Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/test/pixelharness.h Wed Feb 27 12:35:02 2019 +0530
> @@ -136,6 +136,7 @@
> bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t
> opt);
> bool check_integral_initv(integralv_t ref, integralv_t opt);
> bool check_integral_inith(integralh_t ref, integralh_t opt);
> + bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
>
> public:
>
>
> --
> *Regards,*
> *Akil R*
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190307/fb926ca0/attachment-0001.html>
More information about the x265-devel
mailing list