[x265] [PATCH] asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN

Murugan Vairavel murugan at multicorewareinc.com
Tue Dec 3 14:02:48 CET 2013


Please ignore this patch.



On Tue, Dec 3, 2013 at 1:10 PM, <murugan at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1386056379 -19800
> #      Tue Dec 03 13:09:39 2013 +0530
> # Node ID 123d0c4c5683bf5c9c733830b106c538630977d8
> # Parent  5c2fcf4dfc981de6ede28e6b205e0d27c6d4608d
> asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN
>
> diff -r 5c2fcf4dfc98 -r 123d0c4c5683 source/common/x86/ssd-a.asm
> --- a/source/common/x86/ssd-a.asm       Tue Dec 03 12:21:16 2013 +0530
> +++ b/source/common/x86/ssd-a.asm       Tue Dec 03 13:09:39 2013 +0530
> @@ -109,6 +109,179 @@
>      RET
>  %endmacro
>
> +%macro SSD_TWO 2
> +cglobal pixel_ssd_ss_%1x%2, 4,7,6
> +    FIX_STRIDES r1, r3
> +    pxor    m0,  m0
> +    mov     r4d, %2/2
> +    lea     r5,  [r1 * 2]
> +    lea     r6,  [r3 * 2]
> +.loop:
> +    movu    m1,  [r0]
> +    movu    m2,  [r0 + 16]
> +    movu    m3,  [r0 + 32]
> +    movu    m4,  [r0 + 48]
> +    psubw   m1,  [r2]
> +    psubw   m2,  [r2 + 16]
> +    psubw   m3,  [r2 + 32]
> +    psubw   m4,  [r2 + 48]
> +    pmaddwd m1,  m1
> +    pmaddwd m2,  m2
> +    pmaddwd m3,  m3
> +    pmaddwd m4,  m4
> +    paddd   m1,  m2
> +    paddd   m3,  m4
> +    paddd   m0,  m1
> +    paddd   m0,  m3
> +    movu    m1,  [r0 + 64]
> +    movu    m2,  [r0 + 80]
> +    psubw   m1,  [r2 + 64]
> +    psubw   m2,  [r2 + 80]
> +    pmaddwd m1,  m1
> +    pmaddwd m2,  m2
> +    paddd   m1,  m2
> +    paddd   m0,  m1
> +%if %1 == 64
> +    movu    m3,  [r0 + 96]
> +    movu    m4,  [r0 + 112]
> +    psubw   m3,  [r2 + 96]
> +    psubw   m4,  [r2 + 112]
> +    pmaddwd m3,  m3
> +    pmaddwd m4,  m4
> +    paddd   m3,  m4
> +    paddd   m0,  m3
> +%endif
> +    movu    m1,  [r0 + r1]
> +    movu    m2,  [r0 + r1 + 16]
> +    movu    m3,  [r0 + r1 + 32]
> +    movu    m4,  [r0 + r1 + 48]
> +    psubw   m1,  [r2 + r3]
> +    psubw   m2,  [r2 + r3 + 16]
> +    psubw   m3,  [r2 + r3 + 32]
> +    psubw   m4,  [r2 + r3 + 48]
> +    pmaddwd m1,  m1
> +    pmaddwd m2,  m2
> +    pmaddwd m3,  m3
> +    pmaddwd m4,  m4
> +    paddd   m1,  m2
> +    paddd   m3,  m4
> +    paddd   m0,  m1
> +    paddd   m0,  m3
> +    movu    m1,  [r0 + r1 + 64]
> +    movu    m2,  [r0 + r1 + 80]
> +    psubw   m1,  [r2 + r3 + 64]
> +    psubw   m2,  [r2 + r3 + 80]
> +    pmaddwd m1,  m1
> +    pmaddwd m2,  m2
> +    paddd   m1,  m2
> +    paddd   m0,  m1
> +%if %1 == 64
> +    movu    m3,  [r0 + r1 + 96]
> +    movu    m4,  [r0 + r1 + 112]
> +    psubw   m3,  [r2 + r3 + 96]
> +    psubw   m4,  [r2 + r3 + 112]
> +    pmaddwd m3,  m3
> +    pmaddwd m4,  m4
> +    paddd   m3,  m4
> +    paddd   m0,  m3
> +%endif
> +    lea     r0,  [r0 + r5]
> +    lea     r2,  [r2 + r6]
> +    dec     r4d
> +    jnz  .loop
> +    HADDD   m0, m5
> +    movd   eax, xm0
> +    RET
> +%endmacro
> +%macro SSD_24 2
> +cglobal pixel_ssd_ss_%1x%2, 4,7,6
> +    FIX_STRIDES r1, r3
> +    pxor    m0,  m0
> +    mov     r4d, %2/2
> +    lea     r5,  [r1 * 2]
> +    lea     r6,  [r3 * 2]
> +.loop:
> +    movu    m1,  [r0]
> +    movu    m2,  [r0 + 16]
> +    movu    m3,  [r0 + 32]
> +    psubw   m1,  [r2]
> +    psubw   m2,  [r2 + 16]
> +    psubw   m3,  [r2 + 32]
> +    pmaddwd m1,  m1
> +    pmaddwd m2,  m2
> +    pmaddwd m3,  m3
> +    paddd   m1,  m2
> +    paddd   m0,  m1
> +    movu    m1,  [r0 + r1]
> +    movu    m2,  [r0 + r1 + 16]
> +    movu    m4,  [r0 + r1 + 32]
> +    psubw   m1,  [r2 + r3]
> +    psubw   m2,  [r2 + r3 + 16]
> +    psubw   m4,  [r2 + r3 + 32]
> +    pmaddwd m1,  m1
> +    pmaddwd m2,  m2
> +    pmaddwd m4,  m4
> +    paddd   m1,  m2
> +    paddd   m3,  m4
> +    paddd   m0,  m1
> +    paddd   m0,  m3
> +    lea     r0,  [r0 + r5]
> +    lea     r2,  [r2 + r6]
> +    dec     r4d
> +    jnz  .loop
> +    HADDD   m0, m5
> +    movd   eax, xm0
> +    RET
> +%endmacro
> +%macro SSD_12 2
> +cglobal pixel_ssd_ss_%1x%2, 4,7,7
> +    FIX_STRIDES r1, r3
> +    pxor    m0,  m0
> +    mov     r4d, %2/4
> +    lea     r5,  [r1 * 2]
> +    lea     r6,  [r3 * 2]
> +.loop:
> +    movu        m1,  [r0]
> +    movh        m2,  [r0 + 16]
> +    movu        m3,  [r0 + r1]
> +    punpcklqdq  m2,  [r0 + r1 + 16]
> +    psubw       m1,  [r2]
> +    movh        m4,  [r2 + 16]
> +    psubw       m3,  [r2 + r3]
> +    punpcklqdq  m4,  [r2 + r3 + 16]
> +    psubw       m2,  m4
> +    pmaddwd     m1,  m1
> +    pmaddwd     m2,  m2
> +    pmaddwd     m3,  m3
> +    paddd       m1,  m2
> +    paddd       m0,  m1
> +
> +    movu        m1,  [r0 + r5]
> +    movh        m2,  [r0 + r5 + 16]
> +    lea         r0,  [r0 + r5]
> +    movu        m6,  [r0 + r1]
> +    punpcklqdq  m2,  [r0 + r1 + 16]
> +    psubw       m1,  [r2 + r6]
> +    movh        m4,  [r2 + r6 + 16]
> +    lea         r2,  [r2 + r6]
> +    psubw       m6,  [r2 + r3]
> +    punpcklqdq  m4,  [r2 + r3 + 16]
> +    psubw       m2,  m4
> +    pmaddwd     m1,  m1
> +    pmaddwd     m2,  m2
> +    pmaddwd     m6,  m6
> +    paddd       m1,  m2
> +    paddd       m3,  m6
> +    paddd       m0,  m1
> +    paddd       m0,  m3
> +    lea         r0,  [r0 + r5]
> +    lea         r2,  [r2 + r6]
> +    dec         r4d
> +    jnz     .loop
> +    HADDD   m0, m5
> +    movd   eax, xm0
> +    RET
> +%endmacro
>  INIT_MMX mmx2
>  SSD_ONE     4,  4
>  SSD_ONE     4,  8
> @@ -123,17 +296,24 @@
>  SSD_ONE     8,  8
>  SSD_ONE     8, 16
>  SSD_ONE     8, 32
> +SSD_12     12, 16
>  SSD_ONE    16,  4
>  SSD_ONE    16,  8
>  SSD_ONE    16, 12
>  SSD_ONE    16, 16
>  SSD_ONE    16, 32
>  SSD_ONE    16, 64
> +SSD_24     24, 32
>  SSD_ONE    32,  8
>  SSD_ONE    32, 16
>  SSD_ONE    32, 24
>  SSD_ONE    32, 32
>  SSD_ONE    32, 64
> +SSD_TWO    48, 64
> +SSD_TWO    64, 16
> +SSD_TWO    64, 32
> +SSD_TWO    64, 48
> +SSD_TWO    64, 64
>  INIT_YMM avx2
>  SSD_ONE    16,  8
>  SSD_ONE    16, 16
>
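
For readers following the asm above, here is a minimal scalar sketch of what the new pixel_ssd_ss_WxH routines compute: the sum of squared differences between two blocks of int16_t samples (10-bit content stored as shorts). The function name, signature and parameter names below are illustrative assumptions, not the x265 primitive interface.

    #include <stdint.h>

    /* Scalar reference for the operation these macros vectorize
     * (illustrative only; not the x265 primitive). Strides are in
     * samples, i.e. before the FIX_STRIDES byte conversion done in
     * the asm. */
    static uint64_t ssd_ss_ref(const int16_t *a, intptr_t stride_a,
                               const int16_t *b, intptr_t stride_b,
                               int width, int height)
    {
        uint64_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int64_t d = a[x] - b[x];   /* psubw */
                sum += (uint64_t)(d * d);  /* pmaddwd + paddd accumulation */
            }
            a += stride_a;
            b += stride_b;
        }
        return sum;                        /* HADDD + movd eax, xm0 */
    }

The three macros differ only in how each row is covered: SSD_TWO walks 48- and 64-sample rows in 16-byte chunks (with an extra pair of loads when %1 == 64), SSD_24 handles three 16-byte chunks per 24-sample row, and SSD_12 loads the 8-sample head with movu and packs the 4-sample tails of two adjacent rows into one register via punpcklqdq before squaring.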



-- 
With Regards,

Murugan. V
+919659287478