[x265] [PATCH x265] Add AVX2 assembly code for normFactor primitive.
Dinesh Kumar Reddy
dinesh at multicorewareinc.com
Thu Mar 7 10:36:19 CET 2019
# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1551693998 -19800
# Mon Mar 04 15:36:38 2019 +0530
# Node ID 19f27e0c8a6f8250af5e2f7c7984dc3b57bea7e4
# Parent d12a4caf7963fd47d646040689ad5f02754ad879
x86: normFactor primitive
This patch adds AVX2 assembly for this primitive.
Pushed to the default branch of the x265 repo.
Thanks & Regards,
Dinesh
On Tue, Mar 5, 2019 at 10:04 AM Akil <akil at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Akil Ayyappan<akil at multicorewareinc.com>
> # Date 1551693998 -19800
> # Mon Mar 04 15:36:38 2019 +0530
> # Node ID 19f27e0c8a6f8250af5e2f7c7984dc3b57bea7e4
> # Parent d12a4caf7963fd47d646040689ad5f02754ad879
> x86: normFactor primitive
>
> This patch adds AVX2 assembly for this primitive.
>
> |---------|-------------|-------------------|------------------|
> | Size    | Performance | AVX2 clock cycles | CPP clock cycles |
> |---------|-------------|-------------------|------------------|
> | [8x8]   | 7.65x       | 312.90            | 2394.83          |
> | [16x16] | 8.42x       | 1157.14           | 9741.56          |
> | [32x32] | 9.56x       | 3942.18           | 37692.20         |
> | [64x64] | 8.96x       | 15388.24          | 137889.28        |
> |---------|-------------|-------------------|------------------|
>
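> For reviewers who prefer intrinsics to assembly: the kernel boils down to two steps,
> a per-8-pixel column step (the NORM_FACT_COL macro) that accumulates (pixel >> shift)^2
> into four 64-bit lanes, and a final horizontal fold of those lanes into *z_k. Below is a
> rough C-with-AVX2-intrinsics sketch of those two steps, for reference only; it is not
> part of the patch, the helper names norm_fact_col / norm_fact_reduce are purely
> illustrative, and the asm uses the SSIMRD_SHIFT constant where the sketch takes shift
> as a parameter.
>
> #include <immintrin.h>
> #include <stdint.h>
>
> /* One NORM_FACT_COL step: 'v' holds 8 pixels zero-extended to 32-bit dwords
>  * (vpmovzxwd/vpmovzxbd); accumulate (pixel >> shift)^2 into 4 x 64-bit lanes of acc. */
> static inline __m256i norm_fact_col(__m256i acc, __m256i v, int shift)
> {
>     __m256i s    = _mm256_srl_epi32(v, _mm_cvtsi32_si128(shift)); /* vpsrld: pixel >> shift             */
>     __m256i even = _mm256_mul_epi32(s, s);                        /* vpmuldq: squares of dwords 0,2 per lane */
>     __m256i odd  = _mm256_srli_si256(s, 4);                       /* vpsrldq: move dwords 1,3 down per lane  */
>     odd          = _mm256_mul_epi32(odd, odd);                    /* squares of dwords 1,3 per lane          */
>     return _mm256_add_epi64(acc, _mm256_add_epi64(even, odd));    /* vpaddq: accumulate qword partial sums   */
> }
>
> /* Final reduction after the row loop: fold the 4 qword lanes down to one sum,
>  * mirroring the vextracti128 / vpaddq / punpckhqdq / paddq / movq tail below. */
> static inline uint64_t norm_fact_reduce(__m256i acc)
> {
>     __m128i lo  = _mm256_castsi256_si128(acc);
>     __m128i hi  = _mm256_extracti128_si256(acc, 1);
>     __m128i sum = _mm_add_epi64(lo, hi);
>     sum = _mm_add_epi64(sum, _mm_unpackhi_epi64(sum, _mm_setzero_si128()));
>     return (uint64_t)_mm_cvtsi128_si64(sum);
> }
>
> In the assembly, normFact8 performs one such column step per row and normFact64
> performs eight, and all four block sizes share the same reduction tail.
>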
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/pixel.cpp
> --- a/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/common/pixel.cpp Mon Mar 04 15:36:38 2019 +0530
> @@ -959,6 +959,19 @@
> }
> }
>
> +static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
> +{
> + *z_k = 0;
> + for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
> + {
> + for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
> + {
> + uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
> + *z_k += temp * temp;
> + }
> + }
> +}
> +
> #if HIGH_BIT_DEPTH
> static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
> @@ -1314,5 +1327,10 @@
> p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
> p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
> p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
> +
> + p.cu[BLOCK_8x8].normFact = normFact_c;
> + p.cu[BLOCK_16x16].normFact = normFact_c;
> + p.cu[BLOCK_32x32].normFact = normFact_c;
> + p.cu[BLOCK_64x64].normFact = normFact_c;
> }
> }
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/primitives.h
> --- a/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/common/primitives.h Mon Mar 04 15:36:38 2019 +0530
> @@ -228,6 +228,7 @@
> typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos);
> typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
> typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
> +typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);
> /* Function pointers to optimized encoder primitives. Each pointer can reference
> * either an assembly routine, a SIMD intrinsic primitive, or a C function */
> struct EncoderPrimitives
> @@ -305,6 +306,7 @@
> psyRdoQuant_t1 psyRdoQuant_1p;
> psyRdoQuant_t2 psyRdoQuant_2p;
> ssimDistortion_t ssimDist;
> + normFactor_t normFact;
> }
> cu[NUM_CU_SIZES];
> /* These remaining primitives work on either fixed block sizes or take
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/common/x86/asm-primitives.cpp Mon Mar 04 15:36:38 2019 +0530
> @@ -2325,6 +2325,11 @@
> p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
> p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
>
> + p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2);
> + p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2);
> + p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2);
> + p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2);
> +
> /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only
> p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
>
> @@ -4718,6 +4723,11 @@
> p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
> p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
>
> + p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2);
> + p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2);
> + p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2);
> + p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2);
> +
> }
> if (cpuMask & X265_CPU_AVX512)
> {
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/common/x86/pixel-a.asm Mon Mar 04 15:36:38 2019 +0530
> @@ -388,6 +388,16 @@
> vpaddq m7, m6
> %endmacro
>
> +%macro NORM_FACT_COL 1
> + vpsrld m1, m0, SSIMRD_SHIFT
> + vpmuldq m2, m1, m1
> + vpsrldq m1, m1, 4
> + vpmuldq m1, m1, m1
> +
> + vpaddq m1, m2
> + vpaddq m3, m1
> +%endmacro
> +
> ; FIXME avoid the spilling of regs to hold 3*stride.
> ; for small blocks on x86_32, modify pixel pointer instead.
>
> @@ -16303,3 +16313,266 @@
> movq [r4], xm4
> movq [r6], xm7
> RET
> +
> +
> +;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
> +;{
> +; *z_k = 0;
> +; for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
> +; {
> +; for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
> +; {
> +;            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
> +; *z_k += temp * temp;
> +; }
> +; }
> +;}
> +;--------------------------------------------------------------------------------------
> +; void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
> +;--------------------------------------------------------------------------------------
> +INIT_YMM avx2
> +cglobal normFact8, 4, 5, 6
> + mov r4d, 8
> + vpxor m3, m3 ;z_k
> + vpxor m5, m5
> +.row:
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +%if HIGH_BIT_DEPTH
> + lea r0, [r0 + 2 * r1]
> +%else
> + lea r0, [r0 + r1]
> +%endif
> + dec r4d
> + jnz .row
> + vextracti128 xm4, m3, 1
> + vpaddq xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddq xm3, xm2
> + movq [r3], xm3
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal normFact16, 4, 5, 6
> + mov r4d, 16
> + vpxor m3, m3 ;z_k
> + vpxor m5, m5
> +.row:
> +;Col 1-8
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 9-16
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 16] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 8]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +%if HIGH_BIT_DEPTH
> + lea r0, [r0 + 2 * r1]
> +%else
> + lea r0, [r0 + r1]
> +%endif
> + dec r4d
> + jnz .row
> + vextracti128 xm4, m3, 1
> + vpaddq xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddq xm3, xm2
> + movq [r3], xm3
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal normFact32, 4, 5, 6
> + mov r4d, 32
> + vpxor m3, m3 ;z_k
> + vpxor m5, m5
> +.row:
> +;Col 1-8
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 9-16
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 16] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 8]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 17-24
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 32] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 16]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 25-32
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 48] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 24]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +%if HIGH_BIT_DEPTH
> + lea r0, [r0 + 2 * r1]
> +%else
> + lea r0, [r0 + r1]
> +%endif
> + dec r4d
> + jnz .row
> + vextracti128 xm4, m3, 1
> + vpaddq xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddq xm3, xm2
> + movq [r3], xm3
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal normFact64, 4, 5, 6
> + mov r4d, 64
> + vpxor m3, m3 ;z_k
> + vpxor m5, m5
> +.row:
> +;Col 1-8
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 9-16
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 16] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 8]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 17-24
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 32] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 16]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 25-32
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 48] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 24]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 33-40
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 64] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 32]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 41-48
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 80] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 40]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 49-56
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 96] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 48]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +;Col 57-64
> +%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 112] ;src
> +%elif BIT_DEPTH == 8
> + vpmovzxbd m0, [r0 + 56]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> + NORM_FACT_COL m0
> +
> +%if HIGH_BIT_DEPTH
> + lea r0, [r0 + 2 * r1]
> +%else
> + lea r0, [r0 + r1]
> +%endif
> + dec r4d
> + jnz .row
> + vextracti128 xm4, m3, 1
> + vpaddq xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddq xm3, xm2
> + movq [r3], xm3
> + RET
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/common/x86/pixel.h Mon Mar 04 15:36:38 2019 +0530
> @@ -61,7 +61,8 @@
> FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
> FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
> FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); \
> - FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
> + FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); \
> + FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k)
>
> DECL_PIXELS(mmx);
> DECL_PIXELS(mmx2);
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/encoder/analysis.cpp Mon Mar 04 15:36:38 2019 +0530
> @@ -3696,14 +3696,8 @@
>
> // 2. Calculate ac component
> uint64_t z_k = 0;
> - for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
> - {
> - for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
> - {
> - uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
> - z_k += temp * temp;
> - }
> - }
> + int block = (int)((log(blockSize) / log(2)) - 2);
> + primitives.cu[block].normFact(src, blockSize, shift, &z_k);
>
> // Remove the DC part
> z_k -= z_o;
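>
> A note on the analysis.cpp hunk above: blockSize is 8/16/32/64, so
> (int)((log(blockSize) / log(2)) - 2) evaluates to 1..4, i.e. BLOCK_8x8..BLOCK_64x64,
> the same cu[] index the test harness derives via blockSize = 4 << block. A float-free
> equivalent (purely illustrative, not part of the patch) would be:
>
>     int block = 0;
>     while ((4u << block) < blockSize)   /* blockSize 8 -> BLOCK_8x8 (= 1), 64 -> BLOCK_64x64 (= 4) */
>         block++;
>     primitives.cu[block].normFact(src, blockSize, shift, &z_k);
>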
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/test/pixelharness.cpp Mon Mar 04 15:36:38 2019 +0530
> @@ -2296,6 +2296,30 @@
> return true;
> }
>
> +bool PixelHarness::check_normFact(normFactor_t ref, normFactor_t opt, int block)
> +{
> + int shift = X265_DEPTH - 8;
> + uint64_t opt_dest = 0, ref_dest = 0;
> + int j = 0;
> + int blockSize = 4 << block;
> +
> + for (int i = 0; i < ITERS; i++)
> + {
> + int index = i % TEST_CASES;
> + ref(pixel_test_buff[index] + j, blockSize, shift, &ref_dest);
> + opt(pixel_test_buff[index] + j, blockSize, shift, &opt_dest);
> +
> + if (opt_dest != ref_dest)
> + {
> + return false;
> + }
> +
> + reportfail()
> + j += INCR;
> + }
> + return true;
> +}
> +
> bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
> {
> if (opt.pu[part].satd)
> @@ -3129,6 +3153,18 @@
> }
> }
>
> + for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++)
> + {
> + if (opt.cu[i].normFact)
> + {
> if (!check_normFact(ref.cu[i].normFact, opt.cu[i].normFact, i))
> + {
> + printf("\nnormFact[%dx%d] failed!\n", 4 << i, 4 << i);
> + return false;
> + }
> + }
> + }
> +
> return true;
> }
>
> @@ -3769,4 +3805,16 @@
> REPORT_SPEEDUP(opt.integral_inith[k], ref.integral_inith[k], dst_buf, pbuf1, STRIDE);
> }
> }
> +
> + for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++)
> + {
> + if (opt.cu[i].normFact)
> + {
> + uint64_t dst = 0;
> + int blockSize = 4 << i;
> + int shift = X265_DEPTH - 8;
> + printf("normFact[%dx%d]", blockSize, blockSize);
> REPORT_SPEEDUP(opt.cu[i].normFact, ref.cu[i].normFact, pixel_test_buff[0], blockSize, shift, &dst);
> + }
> + }
> }
> diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.h
> --- a/source/test/pixelharness.h Wed Feb 27 12:35:02 2019 +0530
> +++ b/source/test/pixelharness.h Mon Mar 04 15:36:38 2019 +0530
> @@ -137,6 +137,7 @@
> bool check_integral_initv(integralv_t ref, integralv_t opt);
> bool check_integral_inith(integralh_t ref, integralh_t opt);
> bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
> + bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
>
> public:
>
>
>
> --
> Regards,
> Akil R