[x265] [PATCH Review Only] assembly code for pixel_sad_x3_32xN

Steve Borho steve at borho.org
Tue Oct 29 18:50:43 CET 2013


On Tue, Oct 29, 2013 at 6:15 AM, <yuvaraj at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
> # Date 1383044811 -19800
> #      Tue Oct 29 16:36:51 2013 +0530
> # Node ID fc35a117efd17270eb15aa56aad7cc90bb7bdd35
> # Parent  e2f512dbd2424d099d9984c72bfc7d0729be25fe
> assembly code for pixel_sad_x3_32xN
>

When you mark patches as review only, it would be helpful if you described
why you believe the patch needs review or why it is unfinished.


> diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Mon Oct 28 16:13:05 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp      Tue Oct 29 16:36:51 2013 +0530
> @@ -280,6 +280,11 @@
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
>          p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
>          p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
> +        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
> +        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
> +        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
> +        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
> +        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> @@ -310,6 +315,11 @@
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
>          p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
>          p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
> +        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_avx;
> +        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
> +        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
> +        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
> +        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
>      }
>      if (cpuMask & X265_CPU_XOP)
>      {
> diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Mon Oct 28 16:13:05 2013 +0530
> +++ b/source/common/x86/pixel.h Tue Oct 29 16:36:51 2013 +0530
> @@ -29,6 +29,11 @@
>  #define X265_I386_PIXEL_H
>
>  #define DECL_PIXELS(ret, name, suffix, args) \
> +    ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
> +    ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
>      ret x265_pixel_ ## name ## _16x64_ ## suffix args; \
>      ret x265_pixel_ ## name ## _16x32_ ## suffix args; \
>      ret x265_pixel_ ## name ## _16x16_ ## suffix args; \
> diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/sad-a.asm
> --- a/source/common/x86/sad-a.asm       Mon Oct 28 16:13:05 2013 +0530
> +++ b/source/common/x86/sad-a.asm       Tue Oct 29 16:36:51 2013 +0530
> @@ -1007,19 +1007,30 @@
>  ; SAD x3/x4 XMM
>
>  ;=============================================================================
>
> -%macro SAD_X3_START_1x16P_SSE2 0
> -    mova     m2, [r0]
> +%macro SAD_X3_START_1x16P_SSE2 1
> +    mova     m3, [r0 + %1]
> +%if %1 == 0
> +    pxor m0, m0
> +    pxor m1, m1
> +    pxor m2, m2
> +%endif
>  %if cpuflag(avx)
> -    psadbw   m0, m2, [r1]
> -    psadbw   m1, m2, [r2]
> -    psadbw   m2, [r3]
> +    psadbw   m4, m3, [r1 + %1]
> +    psadbw   m5, m3, [r2 + %1]
> +    psadbw   m3, [r3 + %1]
> +    paddd    m0, m4
> +    paddd    m1, m5
> +    paddd    m2, m3
>  %else
> -    movu     m0, [r1]
> -    movu     m1, [r2]
> -    movu     m3, [r3]
> -    psadbw   m0, m2
> -    psadbw   m1, m2
> -    psadbw   m2, m3
> +    movu     m4, [r1 + %1]
> +    movu     m5, [r2 + %1]
> +    movu     m6, [r3 + %1]
> +    psadbw   m4, m3
> +    psadbw   m5, m3
> +    psadbw   m6, m3
> +    paddd    m0, m4
> +    paddd    m1, m5
> +    paddd    m2, m6
>  %endif
>  %endmacro
>
> @@ -1051,7 +1062,7 @@
>  %macro SAD_X3_4x16P_SSE2 2
>  %if %1==0
>      lea  t0, [r4*3]
> -    SAD_X3_START_1x16P_SSE2
> +    SAD_X3_START_1x16P_SSE2 0
>  %else
>      SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
>  %endif
> @@ -1068,6 +1079,30 @@
>  %endif
>  %endmacro
>
> +%macro SAD_X3_4x32P_SSE2 2
> +%assign y 0
> +%rep 2
> +%if %1==0
> +    lea  t0, [r4+r4*2]
> +    SAD_X3_START_1x16P_SSE2 y
> +%else
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(0+(%1&1)*4) + y), (r4*0 + y)
> +%endif
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(1+(%1&1)*4) + y), (r4*1 + y)
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(2+(%1&1)*4) + y), (r4*2 + y)
> +    SAD_X3_1x16P_SSE2 (FENC_STRIDE*(3+(%1&1)*4) + y), (t0 + y)
> +%assign y y+16
> +%endrep
> +%if %1 != %2-1
> +%if (%1&1) != 0
> +    add  r0, 8*FENC_STRIDE
> +%endif
> +    lea  r1, [r1+4*r4]
> +    lea  r2, [r2+4*r4]
> +    lea  r3, [r3+4*r4]
> +%endif
> +%endmacro
> +
>  %macro SAD_X3_START_2x8P_SSE2 0
>      movq     m3, [r0]
>      movq     m0, [r1]
> @@ -1506,7 +1541,7 @@
>      SAD_X%1_4x%2P_SSE2 x, %3/4
>  %assign x x+1
>  %endrep
> -%if %3 == 64
> +%if %3 >= 24
>      SAD_X%1_END_SSE2 1
>  %else
>      SAD_X%1_END_SSE2 0
> @@ -1544,6 +1579,11 @@
>  %endmacro
>
>  INIT_XMM ssse3
> +SAD_X_SSE2  3, 32, 64, 7
> +SAD_X_SSE2  3, 32, 32, 7
> +SAD_X_SSE2  3, 32, 24, 7
> +SAD_X_SSE2  3, 32, 16, 7
> +SAD_X_SSE2  3, 32,  8, 7
>  SAD_X_SSE2  3, 16, 64, 7
>  SAD_X_SSE2  3, 16, 32, 7
>  SAD_X_SSE2  3, 16, 16, 7
> @@ -1562,6 +1602,11 @@
>  SAD_X_SSSE3 4,  8,  4
>
>  INIT_XMM avx
> +SAD_X_SSE2 3, 32, 64, 7
> +SAD_X_SSE2 3, 32, 32, 7
> +SAD_X_SSE2 3, 32, 24, 7
> +SAD_X_SSE2 3, 32, 16, 7
> +SAD_X_SSE2 3, 32,  8, 7
>  SAD_X_SSE2 3, 16, 64, 7
>  SAD_X_SSE2 3, 16, 32, 6
>  SAD_X_SSE2 3, 16, 16, 6
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131029/b2ced1f7/attachment.html>


More information about the x265-devel mailing list