[x265] [PATCH] asm: avx2 code for intra_dc_32x32

Steve Borho steve at borho.org
Fri Apr 3 17:49:24 CEST 2015


On 04/03, dnyaneshwar at multicorewareinc.com wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1428055909 -19800
> #      Fri Apr 03 15:41:49 2015 +0530
> # Node ID 83f44b5a99a1157683d63a18d05297a58437e7a3
> # Parent  cef7834897bc0d53981e5dfe8790bc207deb7346
> asm: avx2 code for intra_dc_32x32

these three are queued, plus the const-a cleanup

> AVX2:
> intra_dc_32x32[f=0]     23.17x   435.66          10093.78
> 
> SSE4:
> intra_dc_32x32[f=0]     14.36x   703.46          10100.78
> 
> diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Fri Apr 03 11:35:53 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Fri Apr 03 15:41:49 2015 +0530
> @@ -1471,6 +1471,8 @@
>  #if X86_64
>      if (cpuMask & X265_CPU_AVX2)
>      {
> +        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
> +
>          p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
>          p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
>  
> diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/intrapred.h
> --- a/source/common/x86/intrapred.h	Fri Apr 03 11:35:53 2015 +0530
> +++ b/source/common/x86/intrapred.h	Fri Apr 03 15:41:49 2015 +0530
> @@ -34,6 +34,7 @@
>  void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
>  void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
>  void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
> +void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
>  
>  void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
>  void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
> diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/intrapred8.asm
> --- a/source/common/x86/intrapred8.asm	Fri Apr 03 11:35:53 2015 +0530
> +++ b/source/common/x86/intrapred8.asm	Fri Apr 03 15:41:49 2015 +0530
> @@ -573,6 +573,7 @@
>  cextern pw_31
>  cextern pw_32
>  cextern pw_257
> +cextern pw_512
>  cextern pw_1024
>  cextern pw_4096
>  cextern pw_00ff
> @@ -2251,6 +2252,69 @@
>  
>      RET
>  
> +;----------------------------------------------------------------------------------------------------
> +; void intra_pred_dc32(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter)
> +;----------------------------------------------------------------------------------------------------
> +%if ARCH_X86_64 == 1                ; assembled for x86-64 builds only
> +INIT_YMM avx2                       ; m0..m2 below are 256-bit ymm registers
> +cglobal intra_pred_dc32, 3, 4, 3    ; DC fill of a 32x32 block; loads 3 args (r0=dst, r1=dstStride, r2=srcPix), 4 GPRs, 3 vector regs; dirMode/bFilter are never read
> +    lea             r3, [r1 * 3]    ; r3 = 3 * dstStride, offset of the 4th row in each 4-row group
> +    pxor            m0, m0          ; m0 = 0, the reference operand for psadbw
> +    movu            m1, [r2 + 1]    ; 32 bytes srcPix[1..32] (presumably the above neighbours - confirm buffer layout)
> +    movu            m2, [r2 + 65]   ; 32 bytes srcPix[65..96] (presumably the left neighbours - confirm buffer layout)
> +    psadbw          m1, m0          ; SAD against zero = byte sum of each 8-byte group, one result per qword
> +    psadbw          m2, m0          ; same for the second 32 bytes
> +    paddw           m1, m2          ; merge both loads: 4 partial sums, one per qword
> +    vextracti128    xm2, m1, 1      ; xm2 = upper 128-bit lane of m1
> +    paddw           m1, m2          ; fold upper lane into lower lane: 2 partial sums remain
> +    pshufd          m2, m1, 2       ; bring dword 2 (second partial sum) down to dword 0
> +    paddw           m1, m2          ; word 0 of m1 = sum of all 64 loaded bytes
> +
> +    pmulhrsw        m1, [pw_512]    ; dc = (sum + 32) / 64, rounded mean of the 64 bytes (multiply by 512 with rounding >> 15)
> +    vpbroadcastb    m1, xm1         ; m1 = low byte (dc value) replicated into all 32 bytes
> +
> +    movu            [r0 + r1 * 0], m1   ; rows 0-3: one 32-byte store per row
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 4-7
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 8-11
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 12-15
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 16-19
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 20-23
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 24-27
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    lea             r0, [r0 + 4 * r1]   ; dst += 4 rows
> +    movu            [r0 + r1 * 0], m1   ; rows 28-31 (last group, no further pointer advance)
> +    movu            [r0 + r1 * 1], m1
> +    movu            [r0 + r1 * 2], m1
> +    movu            [r0 + r3 * 1], m1
> +    RET
> +%endif ;; ARCH_X86_64 == 1
> +
>  ;---------------------------------------------------------------------------------------
>  ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
>  ;---------------------------------------------------------------------------------------
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list