[x265] [PATCH] asm: disabled 10bpp AVX primitives having less than 3% speed up over SSE
Aasaipriya Chandran
aasaipriya at multicorewareinc.com
Tue Aug 18 08:25:33 CEST 2015
Please disregard this patch. I will merge the AVX and AVX2 primitives
(the ones to be disabled) into one patch and resend it.
Apologies for the confusion.
Thanks,
Aasaipriya
On Mon, Aug 17, 2015 at 11:57 AM, <aasaipriya at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
> # Date 1439792797 -19800
> # Mon Aug 17 11:56:37 2015 +0530
> # Node ID 6364dbfbf62a504d2899831ed8832ff7d5f2fa91
> # Parent d56b2466c04459205287e1581d8a36eebf372ba6
> asm: disabled 10bpp AVX primitives having less than 3% speed up over SSE
>
> these AVX primitives are slower than, or less than 3% faster than, the SSE primitives
>
> diff -r d56b2466c044 -r 6364dbfbf62a source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Aug 12 18:12:20 2015
> +0530
> +++ b/source/common/x86/asm-primitives.cpp Mon Aug 17 11:56:37 2015
> +0530
> @@ -1169,7 +1169,6 @@
> }
> if (cpuMask & X265_CPU_AVX)
> {
> - // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d =
> PFX(pixel_satd_4x4_avx); fails tests
> p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd =
> PFX(pixel_satd_16x24_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd =
> PFX(pixel_satd_32x48_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd =
> PFX(pixel_satd_24x64_avx);
> @@ -1177,32 +1176,40 @@
> p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd =
> PFX(pixel_satd_8x12_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd =
> PFX(pixel_satd_12x32_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd =
> PFX(pixel_satd_4x32_avx);
> - p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd =
> PFX(pixel_satd_4x8_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd =
> PFX(pixel_satd_8x16_avx);
> - p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd =
> PFX(pixel_satd_4x4_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd =
> PFX(pixel_satd_8x8_avx);
> - p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd =
> PFX(pixel_satd_4x16_avx);
> p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd =
> PFX(pixel_satd_8x32_avx);
> - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd =
> PFX(pixel_satd_8x4_avx);
> -
> - ALL_LUMA_PU(satd, pixel_satd, avx);
> p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd =
> PFX(pixel_satd_8x8_avx);
> - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd =
> PFX(pixel_satd_8x4_avx);
> p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd =
> PFX(pixel_satd_8x16_avx);
> p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd =
> PFX(pixel_satd_8x32_avx);
> - p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd =
> PFX(pixel_satd_12x16_avx);
> p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd =
> PFX(pixel_satd_24x32_avx);
> - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd =
> PFX(pixel_satd_4x16_avx);
> - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd =
> PFX(pixel_satd_4x8_avx);
> -#if X265_DEPTH <= 10
> - ASSIGN_SA8D(avx);
> -#endif
> +
> + p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx);
> + p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx);
> + p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx);
> + p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx);
> + p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx);
> + p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx);
> + p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx);
> + p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx);
> + p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx);
> + p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx);
> + p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx);
> + p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx);
> + p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx);
> + p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx);
> + p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx);
> + p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx);
> + p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx);
> + p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx);
> + p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx);
> + p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx);
> +
> p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d =
> PFX(pixel_sa8d_8x8_avx);
> p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d =
> PFX(pixel_sa8d_16x16_avx);
> p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d =
> PFX(pixel_sa8d_32x32_avx);
> - LUMA_VAR(avx);
> - p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
> - p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
> +
> + p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx);
>
> // copy_pp primitives
> // 16 x N
> @@ -1299,6 +1306,30 @@
> p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
> p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
> p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
> +
> + /* The following primitives have been disabled since their performance gain
> over SSE4 is negligible or negative */
> +#if 0
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd =
> PFX(pixel_satd_4x8_avx);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd =
> PFX(pixel_satd_4x8_avx);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd =
> PFX(pixel_satd_4x4_avx);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd =
> PFX(pixel_satd_4x16_avx);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd =
> PFX(pixel_satd_4x16_avx);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd =
> PFX(pixel_satd_8x4_avx);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd =
> PFX(pixel_satd_8x4_avx);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd =
> PFX(pixel_satd_12x16_avx);
> +
> + ALL_LUMA_PU(satd, pixel_satd, avx);
> +
> + LUMA_VAR(avx);
> + p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx);
> + p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx);
> + p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx)
> +
> + ASSIGN_SA8D(avx);
> +
> + p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
> + p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
> +#endif
> }
> if (cpuMask & X265_CPU_XOP)
> {
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150818/ee56d8da/attachment.html>
More information about the x265-devel
mailing list