[x265] [PATCH] asm: fix sse_ss for [16x16], [32x32] and [64x64] sse2 12bpp
chen
chenm003 at 163.com
Mon Sep 21 18:20:55 CEST 2015
At 2015-09-18 12:30:06,ramya at multicorewareinc.com wrote:
># HG changeset patch
># User Ramya Sriraman <ramya at multicorewareinc.com>
># Date 1442550588 -19800
># Fri Sep 18 09:59:48 2015 +0530
># Node ID 2cca9810882147d5aece67e22403d5d40f768024
># Parent 8db83511da0b11b7347adea081269e3591029841
>asm: fix sse_ss for [16x16], [32x32] and [64x64] sse2 12bpp
>
>diff -r 8db83511da0b -r 2cca98108821 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Mon Sep 14 09:28:07 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Fri Sep 18 09:59:48 2015 +0530
>@@ -1002,10 +1002,12 @@
> p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
> p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
> p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
>-#if X265_DEPTH <= 10
>- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
>- ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
>-#endif
>+ p.cu[BLOCK_4x4].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_4x4_mmx2);
>+ p.cu[BLOCK_8x8].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_8x8_sse2);
>+ p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_sse2);
>+ p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_sse2);
>+ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_sse2);
>+
> p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
> p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
> p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
>diff -r 8db83511da0b -r 2cca98108821 source/common/x86/ssd-a.asm
>--- a/source/common/x86/ssd-a.asm Mon Sep 14 09:28:07 2015 +0530
>+++ b/source/common/x86/ssd-a.asm Fri Sep 18 09:59:48 2015 +0530
>@@ -181,6 +181,197 @@
> RET
> %endmacro
>
>+;Function to find ssd for 8x16 block, sse2, 12 bit depth
>+;Defined separately to be called from SSD_ONE_SS_16 macro
>+INIT_XMM sse2
>+cglobal ssd_ss_8x16
>+ pxor m8, m8
>+ mov r4d, 4
>+.loop:
>+ movu m0, [r0]
>+ movu m1, [r0 + mmsize]
>+ movu m2, [r0 + r1]
>+ movu m3, [r0 + r1+ mmsize]
>+ movu m4, [r2]
>+ movu m5, [r2 + mmsize]
>+ movu m6, [r2 + r3]
>+ movu m7, [r2 + r3 + mmsize]
>+ psubw m0, m4
>+ psubw m1, m5
>+ psubw m2, m6
>+ psubw m3, m7
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+ pmaddwd m0, m0
>+ pmaddwd m1, m1
>+ pmaddwd m2, m2
>+ pmaddwd m3, m3
>+ paddd m2, m3
>+ paddd m0, m1
>+ paddd m0, m2
paddd m8, m2 <-- this style breaks the dependency chain and may be faster
>+ paddd m8, m0
>+ dec r4d
>+ jnz .loop
>+
>+ mova m4, m8
>+ pxor m5, m5
>+ punpckldq m8, m5
>+ punpckhdq m4, m5
>+ paddq m4, m8
>+ movhlps m5, m4
>+ paddq m4, m5
>+ paddq m9, m4
In this case the dynamic range is
12 + 12 + 3 + 4 = 31 bits, so we do not need QWORD operations (the 3 + 4 comes from the 8x16 block size).
Another problem: this function is a common helper shared with other functions, so we do not need to sum all elements into one QWORD here; we can do that in the last stage.
The same comments apply to the code below.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150922/ad5457bf/attachment.html>
More information about the x265-devel
mailing list