[x265] Fwd: [PATCH] asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Thu Apr 16 12:52:23 CEST 2015
Thanks Praveen. Will correct the message and resend.
On Thu, Apr 16, 2015 at 4:06 PM, Praveen Tiwari <praveen at multicorewareinc.com> wrote:
>
> Regards,
> Praveen
>
> ---------- Forwarded message ----------
> From: <dnyaneshwar at multicorewareinc.com>
> Date: Thu, Apr 16, 2015 at 3:43 PM
> Subject: [x265] [PATCH] asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE
> To: x265-devel at videolan.org
>
>
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1429173485 -19800
> # Thu Apr 16 14:08:05 2015 +0530
> # Node ID 04e7526a8bde9e46867f5c4cfb63b98409c7fb44
> # Parent ebca2a0d3ab905b62c346d5d0b23d50c618d5827
> asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE
>
> AVX2:
> satd[48x64] 12.52x 7696.91 96366.03
> satd[64x48] 12.16x 8103.43 98564.64
> satd[64x16] 12.15x 2759.65 33537.19
> satd[64x32] 12.12x 5372.52 65090.38
> satd[64x64] 13.02x 10260.38 133615.69
>
> SSE:
> satd[48x64] 5.32x 18146.13 96505.38
> satd[64x48] 5.33x 18201.03 96975.23
> satd[64x16] 5.21x 6272.14 32651.24
> satd[64x32] 5.42x 11910.58 64529.81
> satd[64x64] 5.30x 26665.73 141387.59
>
>
> Please correct the commit message: a 100% improvement would mean your new
> code takes zero cycles. The % improvement is calculated as below:
>
> (old cycles - new cycles) * 100 / old cycles
>
> So for 48x64: (18146 - 7696) * 100 / 18146 = ~57%.
>
> Please also correct previous commit messages if you have done the same
> thing.
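>
> For reference, a small standalone sketch (illustrative only, not part of the
> patch) that recomputes the improvement from the SSE/AVX2 cycle counts quoted
> above using that formula:
>
>     // improvement.cpp -- illustrative only; values copied from the tables above
>     #include <cstdio>
>
>     int main()
>     {
>         struct Row { const char* name; double sseCycles, avx2Cycles; };
>         const Row rows[] = {
>             { "satd[48x64]", 18146.13,  7696.91 },
>             { "satd[64x48]", 18201.03,  8103.43 },
>             { "satd[64x16]",  6272.14,  2759.65 },
>             { "satd[64x32]", 11910.58,  5372.52 },
>             { "satd[64x64]", 26665.73, 10260.38 },
>         };
>
>         for (const Row& r : rows)
>         {
>             // % improvement = (old cycles - new cycles) * 100 / old cycles
>             double pct = (r.sseCycles - r.avx2Cycles) * 100.0 / r.sseCycles;
>             printf("%s  %.1f%% fewer cycles than SSE\n", r.name, pct);
>         }
>         return 0;
>     }
>
> This prints roughly 55-62% for the five sizes, i.e. the AVX2 code takes a bit
> under half the cycles of the SSE code rather than "zero cycles".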
>
>
>
>
> diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Thu Apr 16 12:22:53 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp Thu Apr 16 14:08:05 2015 +0530
> @@ -1707,6 +1707,11 @@
> p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_avx2;
> p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_avx2;
> p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_avx2;
> + p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
> + p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_avx2;
> + p.pu[LUMA_64x32].satd = x265_pixel_satd_64x32_avx2;
> + p.pu[LUMA_64x48].satd = x265_pixel_satd_64x48_avx2;
> + p.pu[LUMA_64x64].satd = x265_pixel_satd_64x64_avx2;
>
> p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
> p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
> diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Thu Apr 16 12:22:53 2015 +0530
> +++ b/source/common/x86/pixel-a.asm Thu Apr 16 14:08:05 2015 +0530
> @@ -10903,4 +10903,279 @@
> movd eax, xm0
> RET
>
> +cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
> + mova m7, [hmul_16p]
> + lea r4, [3 * r1]
> + lea r5, [3 * r3]
> + pxor m6, m6
> + mov r6, r0
> + mov r7, r2
> +
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + mova m9, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m9, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + mova m8, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m8, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> +
> + pmaddwd m6, [pw_1]
> + vextracti128 xm2, m9, 1
> + vextracti128 xm1, m8, 1
> + vextracti128 xm0, m6, 1
> + paddd xm2, xm9
> + paddd xm1, xm8
> + paddd xm0, xm6
> + paddd xm0, xm1
> + paddd xm0, xm2
> + movhlps xm7, xm0
> + paddd xm0, xm7
> + pshuflw xm7, xm0, q0032
> + paddd xm0, xm7
> + movd eax, xm0
> + RET
> +
> +cglobal pixel_satd_64x16, 4,8,8 ; if WIN64 && cpuflag(avx2)
> + mova m7, [hmul_16p]
> + lea r4, [3 * r1]
> + lea r5, [3 * r3]
> + pxor m6, m6
> + mov r6, r0
> + mov r7, r2
> +
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call calc_satd_16x8
> + call calc_satd_16x8
> +
> + vextracti128 xm0, m6, 1
> + paddw xm0, xm6
> + pmaddwd xm0, [pw_1]
> + movhlps xm7, xm0
> + paddd xm0, xm7
> + pshuflw xm7, xm0, q0032
> + paddd xm0, xm7
> + movd eax, xm0
> + RET
> +
> +cglobal pixel_satd_64x32, 4,8,9 ; if WIN64 && cpuflag(avx2)
> + mova m7, [hmul_16p]
> + lea r4, [3 * r1]
> + lea r5, [3 * r3]
> + pxor m6, m6
> + mov r6, r0
> + mov r7, r2
> +
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + mova m8, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m8, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> +
> + pmaddwd m6, [pw_1]
> + vextracti128 xm1, m8, 1
> + vextracti128 xm0, m6, 1
> + paddd xm1, xm8
> + paddd xm0, xm6
> + paddd xm0, xm1
> + movhlps xm7, xm0
> + paddd xm0, xm7
> + pshuflw xm7, xm0, q0032
> + paddd xm0, xm7
> + movd eax, xm0
> + RET
> +
> +cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2)
> + mova m7, [hmul_16p]
> + lea r4, [3 * r1]
> + lea r5, [3 * r3]
> + pxor m6, m6
> + mov r6, r0
> + mov r7, r2
> +
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + mova m8, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m8, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + mova m9, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m9, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> +
> + pmaddwd m6, [pw_1]
> + vextracti128 xm2, m9, 1
> + vextracti128 xm1, m8, 1
> + vextracti128 xm0, m6, 1
> + paddd xm2, xm9
> + paddd xm1, xm8
> + paddd xm0, xm6
> + paddd xm0, xm2
> + paddd xm0, xm1
> + movhlps xm7, xm0
> + paddd xm0, xm7
> + pshuflw xm7, xm0, q0032
> + paddd xm0, xm7
> + movd eax, xm0
> + RET
> +
> +cglobal pixel_satd_64x64, 4,8,11 ; if WIN64 && cpuflag(avx2)
> + mova m7, [hmul_16p]
> + lea r4, [3 * r1]
> + lea r5, [3 * r3]
> + pxor m6, m6
> + mov r6, r0
> + mov r7, r2
> +
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 16]
> + lea r2, [r7 + 16]
> + mova m10, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m10, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 32]
> + lea r2, [r7 + 32]
> + mova m9, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m9, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + lea r0, [r6 + 48]
> + lea r2, [r7 + 48]
> + mova m8, m6 ; to avoid overflow, move to another register
> + pxor m6, m6
> + pmaddwd m8, [pw_1]
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> + call calc_satd_16x8
> +
> + pmaddwd m6, [pw_1]
> + vextracti128 xm3, m10, 1
> + vextracti128 xm2, m9, 1
> + vextracti128 xm1, m8, 1
> + vextracti128 xm0, m6, 1
> + paddd xm3, xm10
> + paddd xm2, xm9
> + paddd xm1, xm8
> + paddd xm0, xm6
> + paddd xm0, xm3
> + paddd xm0, xm2
> + paddd xm0, xm1
> + movhlps xm7, xm0
> + paddd xm0, xm7
> + pshuflw xm7, xm0, q0032
> + paddd xm0, xm7
> + movd eax, xm0
> + RET
> +
> %endif ; if ARCH_X86_64 == 1
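>
> A rough C-level model of what the new kernels compute (illustrative only,
> not part of the patch): each primitive walks the block in 16-wide columns
> and calls the existing calc_satd_16x8 helper for every 16x8 tile. The asm
> keeps the running sum in a packed 16-bit accumulator (m6) and, roughly every
> eight tiles, widens it with pmaddwd [pw_1] into a spare register so the
> 16-bit words cannot overflow. satd_16x8() below is a hypothetical scalar
> stand-in for that helper:
>
>     #include <cstdint>
>
>     // hypothetical scalar stand-in for the asm label calc_satd_16x8
>     int satd_16x8(const uint8_t* pix1, intptr_t stride1,
>                   const uint8_t* pix2, intptr_t stride2);
>
>     // model of pixel_satd_48x64: 3 columns x 8 rows of 16x8 tiles;
>     // the 64xN kernels walk 4 columns and N/8 rows the same way
>     int satd_48x64_model(const uint8_t* pix1, intptr_t stride1,
>                          const uint8_t* pix2, intptr_t stride2)
>     {
>         int sum = 0;
>         for (int x = 0; x < 48; x += 16)        // 16-wide columns
>             for (int y = 0; y < 64; y += 8)     // 16x8 tiles down each column
>                 sum += satd_16x8(pix1 + y * stride1 + x, stride1,
>                                  pix2 + y * stride2 + x, stride2);
>         return sum;
>     }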
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>