[x265] [PATCH] asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN
Murugan Vairavel
murugan at multicorewareinc.com
Tue Dec 3 14:02:48 CET 2013
Ignore this patch.
On Tue, Dec 3, 2013 at 1:10 PM, <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1386056379 -19800
> # Tue Dec 03 13:09:39 2013 +0530
> # Node ID 123d0c4c5683bf5c9c733830b106c538630977d8
> # Parent 5c2fcf4dfc981de6ede28e6b205e0d27c6d4608d
> asm: 10bpp code for pixel_sse_pp for 12x16, 24x32 and 64xN
>
> diff -r 5c2fcf4dfc98 -r 123d0c4c5683 source/common/x86/ssd-a.asm
> --- a/source/common/x86/ssd-a.asm Tue Dec 03 12:21:16 2013 +0530
> +++ b/source/common/x86/ssd-a.asm Tue Dec 03 13:09:39 2013 +0530
> @@ -109,6 +109,179 @@
> RET
> %endmacro
>
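> +; SSD_TWO %1x%2: SSE of two rows per iteration for 48- and 64-sample-wide
> +; blocks of 16-bit samples; the %if %1 == 64 branches cover the last
> +; 16 samples of 64-wide rows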
> +%macro SSD_TWO 2
> +cglobal pixel_ssd_ss_%1x%2, 4,7,6
> + FIX_STRIDES r1, r3
> + pxor m0, m0
> + mov r4d, %2/2
> + lea r5, [r1 * 2]
> + lea r6, [r3 * 2]
> +.loop:
> + movu m1, [r0]
> + movu m2, [r0 + 16]
> + movu m3, [r0 + 32]
> + movu m4, [r0 + 48]
> + psubw m1, [r2]
> + psubw m2, [r2 + 16]
> + psubw m3, [r2 + 32]
> + psubw m4, [r2 + 48]
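> + ; pmaddwd x, x squares each 16-bit difference and sums adjacent
> + ; pairs into 32-bit dword lanes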
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m0, m1
> + paddd m0, m3
> + movu m1, [r0 + 64]
> + movu m2, [r0 + 80]
> + psubw m1, [r2 + 64]
> + psubw m2, [r2 + 80]
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + paddd m1, m2
> + paddd m0, m1
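> + ; 64-sample rows are 128 bytes wide; pick up the final 16 samples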
> +%if %1 == 64
> + movu m3, [r0 + 96]
> + movu m4, [r0 + 112]
> + psubw m3, [r2 + 96]
> + psubw m4, [r2 + 112]
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m3, m4
> + paddd m0, m3
> +%endif
> + movu m1, [r0 + r1]
> + movu m2, [r0 + r1 + 16]
> + movu m3, [r0 + r1 + 32]
> + movu m4, [r0 + r1 + 48]
> + psubw m1, [r2 + r3]
> + psubw m2, [r2 + r3 + 16]
> + psubw m3, [r2 + r3 + 32]
> + psubw m4, [r2 + r3 + 48]
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m0, m1
> + paddd m0, m3
> + movu m1, [r0 + r1 + 64]
> + movu m2, [r0 + r1 + 80]
> + psubw m1, [r2 + r3 + 64]
> + psubw m2, [r2 + r3 + 80]
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + paddd m1, m2
> + paddd m0, m1
> +%if %1 == 64
> + movu m3, [r0 + r1 + 96]
> + movu m4, [r0 + r1 + 112]
> + psubw m3, [r2 + r3 + 96]
> + psubw m4, [r2 + r3 + 112]
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m3, m4
> + paddd m0, m3
> +%endif
> + lea r0, [r0 + r5]
> + lea r2, [r2 + r6]
> + dec r4d
> + jnz .loop
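> + ; horizontally add the four dword partial sums in m0 (m5 is scratch)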
> + HADDD m0, m5
> + movd eax, xm0
> + RET
> +%endmacro
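> +
> +; SSD_24 %1x%2: SSE for 24-sample-wide rows (48 bytes), two rows per
> +; iteration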
> +%macro SSD_24 2
> +cglobal pixel_ssd_ss_%1x%2, 4,7,6
> + FIX_STRIDES r1, r3
> + pxor m0, m0
> + mov r4d, %2/2
> + lea r5, [r1 * 2]
> + lea r6, [r3 * 2]
> +.loop:
> + movu m1, [r0]
> + movu m2, [r0 + 16]
> + movu m3, [r0 + 32]
> + psubw m1, [r2]
> + psubw m2, [r2 + 16]
> + psubw m3, [r2 + 32]
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + paddd m1, m2
> + paddd m0, m1
> + movu m1, [r0 + r1]
> + movu m2, [r0 + r1 + 16]
> + movu m4, [r0 + r1 + 32]
> + psubw m1, [r2 + r3]
> + psubw m2, [r2 + r3 + 16]
> + psubw m4, [r2 + r3 + 32]
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m0, m1
> + paddd m0, m3
> + lea r0, [r0 + r5]
> + lea r2, [r2 + r6]
> + dec r4d
> + jnz .loop
> + HADDD m0, m5
> + movd eax, xm0
> + RET
> +%endmacro
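> +
> +; SSD_12 %1x%2: SSE for 12-sample-wide rows, four rows per iteration;
> +; the 4-sample tails of two rows are packed into one xmm register with
> +; punpcklqdq so no lane is wasted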
> +%macro SSD_12 2
> +cglobal pixel_ssd_ss_%1x%2, 4,7,7
> + FIX_STRIDES r1, r3
> + pxor m0, m0
> + mov r4d, %2/4
> + lea r5, [r1 * 2]
> + lea r6, [r3 * 2]
> +.loop:
> + movu m1, [r0]
> + movh m2, [r0 + 16]
> + movu m3, [r0 + r1]
> + punpcklqdq m2, [r0 + r1 + 16]
> + psubw m1, [r2]
> + movh m4, [r2 + 16]
> + psubw m3, [r2 + r3]
> + punpcklqdq m4, [r2 + r3 + 16]
> + psubw m2, m4
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + paddd m1, m2
> + paddd m0, m1
> +
> + movu m1, [r0 + r5]
> + movh m2, [r0 + r5 + 16]
> + lea r0, [r0 + r5]
> + movu m6, [r0 + r1]
> + punpcklqdq m2, [r0 + r1 + 16]
> + psubw m1, [r2 + r6]
> + movh m4, [r2 + r6 + 16]
> + lea r2, [r2 + r6]
> + psubw m6, [r2 + r3]
> + punpcklqdq m4, [r2 + r3 + 16]
> + psubw m2, m4
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m6, m6
> + paddd m1, m2
> + paddd m3, m6
> + paddd m0, m1
> + paddd m0, m3
> + lea r0, [r0 + r5]
> + lea r2, [r2 + r6]
> + dec r4d
> + jnz .loop
> + HADDD m0, m5
> + movd eax, xm0
> + RET
> +%endmacro
> INIT_MMX mmx2
> SSD_ONE 4, 4
> SSD_ONE 4, 8
> @@ -123,17 +296,24 @@
> SSD_ONE 8, 8
> SSD_ONE 8, 16
> SSD_ONE 8, 32
> +SSD_12 12, 16
> SSD_ONE 16, 4
> SSD_ONE 16, 8
> SSD_ONE 16, 12
> SSD_ONE 16, 16
> SSD_ONE 16, 32
> SSD_ONE 16, 64
> +SSD_24 24, 32
> SSD_ONE 32, 8
> SSD_ONE 32, 16
> SSD_ONE 32, 24
> SSD_ONE 32, 32
> SSD_ONE 32, 64
> +SSD_TWO 48, 64
> +SSD_TWO 64, 16
> +SSD_TWO 64, 32
> +SSD_TWO 64, 48
> +SSD_TWO 64, 64
> INIT_YMM avx2
> SSD_ONE 16, 8
> SSD_ONE 16, 16
>
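For reference, every macro above accumulates the same quantity: the sum of
squared differences over a block of 16-bit (10bpp) samples, returned as a
32-bit value in eax. Below is a minimal scalar sketch of that computation;
the function name and signature are illustrative, not x265's actual
primitive interface:

    #include <stdint.h>

    /* Scalar equivalent of the SIMD loops above: SSE over a
     * width x height block of 16-bit (10bpp) samples. Strides are in
     * samples here; the asm scales byte strides with FIX_STRIDES. */
    static uint32_t ssd_ss_ref(const int16_t *a, intptr_t strideA,
                               const int16_t *b, intptr_t strideB,
                               int width, int height)
    {
        uint32_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int d = a[x] - b[x];   /* 10bpp diffs fit in 16 bits */
                sum += (uint32_t)(d * d);
            }
            a += strideA;
            b += strideB;
        }
        return sum;
    }

The vector loops get their throughput from pmaddwd, which squares eight
16-bit differences and pair-sums them into four dword lanes in a single
instruction; HADDD then folds those lanes once per call rather than once
per row.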
--
With Regards,
Murugan. V
+919659287478