[x265] [PATCH] asm: fix sse_ss for [16x16], [32x32] and [64x64] sse2 12bpp
chen
chenm003 at 163.com
Tue Sep 22 17:01:52 CEST 2015
I checked the code before: we use sse_ss with resiYuv, which is up to 13 bits (in 12bpp mode, the absolute value is 12 bits), so the dynamic range is 12+12 + 3+4 = 31 bits.
Of course, we'd better use some debug code to confirm it again.
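As a sanity check, here is a minimal standalone sketch of that dynamic-range arithmetic (the bounds and block size are taken from the numbers above; this is just worst-case math, not x265 code):

    #include <cstdint>
    #include <cassert>

    int main()
    {
        // 12bpp residual difference: absolute value at most 2^12 - 1 = 4095
        const uint64_t maxAbsDiff = (1 << 12) - 1;
        // each squared term needs at most 12+12 = 24 bits
        const uint64_t maxTerm = maxAbsDiff * maxAbsDiff;
        // an 8x16 block contributes 2^(3+4) = 128 terms
        const uint64_t maxSum = maxTerm << (3 + 4);
        // 12+12 + 3+4 = 31: the worst-case sum fits in 31 bits,
        // so one DWORD accumulator is enough per 8x16 call
        assert(maxSum < (1ULL << 31));
        return 0;
    }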
At 2015-09-22 13:26:03, "Ramya Sriraman" <ramya at multicorewareinc.com> wrote:
Hi Min,
Thanks for the feedback.
The testbench passes this function an input array called 'short_test_buff', which is initialized to a min value of -1 << 12 and a max value of 1 << 12, which are 13-bit and 16-bit values respectively.
My best guess for the dynamic range is 16+16 + 3+4 = 39, so I think we should use QWORD.
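For comparison, the same worst-case check with the testbench extremes plugged in (a sketch only; it squares the 13-bit difference directly, so the bound comes out a little below the 39-bit estimate, but either way it overflows a DWORD):

    #include <cstdint>
    #include <cassert>

    int main()
    {
        // testbench extremes: one input at -(1 << 12), the other at 1 << 12
        const int64_t a = -(1 << 12), b = 1 << 12;
        const uint64_t maxTerm = (uint64_t)((b - a) * (b - a)); // (2^13)^2 = 2^26
        // over an 8x16 block: 2^(3+4) terms -> 2^33 total
        const uint64_t maxSum = maxTerm << (3 + 4);
        assert(maxSum > (1ULL << 32)); // too big for a DWORD accumulator
        return 0;
    }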
Thank you
Regards
Ramya
On Mon, Sep 21, 2015 at 9:50 PM, chen <chenm003 at 163.com> wrote:
At 2015-09-18 12:30:06, ramya at multicorewareinc.com wrote:
># HG changeset patch
># User Ramya Sriraman <ramya at multicorewareinc.com>
># Date 1442550588 -19800
># Fri Sep 18 09:59:48 2015 +0530
># Node ID 2cca9810882147d5aece67e22403d5d40f768024
># Parent 8db83511da0b11b7347adea081269e3591029841
>asm: fix sse_ss for [16x16], [32x32] and [64x64] sse2 12bpp
>
>diff -r 8db83511da0b -r 2cca98108821 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Mon Sep 14 09:28:07 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Fri Sep 18 09:59:48 2015 +0530
>@@ -1002,10 +1002,12 @@
> p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
> p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
> p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
>-#if X265_DEPTH <= 10
>- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
>- ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
>-#endif
>+ p.cu[BLOCK_4x4].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_4x4_mmx2);
>+ p.cu[BLOCK_8x8].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_8x8_sse2);
>+ p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_sse2);
>+ p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_sse2);
>+ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_sse2);
>+
> p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
> p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
> p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
>diff -r 8db83511da0b -r 2cca98108821 source/common/x86/ssd-a.asm
>--- a/source/common/x86/ssd-a.asm Mon Sep 14 09:28:07 2015 +0530
>+++ b/source/common/x86/ssd-a.asm Fri Sep 18 09:59:48 2015 +0530
>@@ -181,6 +181,197 @@
> RET
> %endmacro
>
>+; Function to find SSD for an 8x16 block, sse2, 12-bit depth
>+; Defined separately so it can be called from the SSD_ONE_SS_16 macro
>+INIT_XMM sse2
>+cglobal ssd_ss_8x16
>+ pxor m8, m8
>+ mov r4d, 4
>+.loop:
>+ movu m0, [r0]
>+ movu m1, [r0 + mmsize]
>+ movu m2, [r0 + r1]
>+ movu m3, [r0 + r1 + mmsize]
>+ movu m4, [r2]
>+ movu m5, [r2 + mmsize]
>+ movu m6, [r2 + r3]
>+ movu m7, [r2 + r3 + mmsize]
>+ psubw m0, m4
>+ psubw m1, m5
>+ psubw m2, m6
>+ psubw m3, m7
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+ pmaddwd m0, m0
>+ pmaddwd m1, m1
>+ pmaddwd m2, m2
>+ pmaddwd m3, m3
>+ paddd m2, m3
>+ paddd m0, m1
>+ paddd m0, m2
paddd m8, m2 <-- this style breaks the dependency chain (m0 += m1 and m8 += m2 can then issue in parallel), so it may be faster
>+ paddd m8, m0
>+ dec r4d
>+ jnz .loop
>+
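>+ ; horizontal reduction: zero-extend the four DWORD sums in m8 to
>+ ; QWORDs, fold them into a single QWORD, and accumulate it in m9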
>+ mova m4, m8
>+ pxor m5, m5
>+ punpckldq m8, m5
>+ punpckhdq m4, m5
>+ paddq m4, m8
>+ movhlps m5, m4
>+ paddq m4, m5
>+ paddq m9, m4
In this case the dynamic range is 12+12 + 3+4 = 31 bits, so we don't need QWORD operations (the 3+4 is because the block is 8x16).
Another problem: this function is meant to be a common helper shared by other functions, so we don't need to sum all the elements into one QWORD here; we can do that in the last stage.
The same comment applies to the code below.
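A minimal C++ sketch of the accumulation scheme being suggested here (the function names and the lane layout are illustrative, not the actual x265 primitives, and it assumes the 12-bit bound on the residual difference argued above):

    #include <cstdint>

    // Shared 8x16 helper: keeps four 32-bit lane sums. Under the 12-bit
    // bound (|a[i] - b[i]| <= 4095) each call accumulates at most 31 bits,
    // so the DWORD lanes cannot overflow. The lane grouping mimics SIMD
    // lanes but is purely illustrative.
    static void ssd_ss_8x16_lanes(const int16_t* a, intptr_t strideA,
                                  const int16_t* b, intptr_t strideB,
                                  uint32_t lanes[4])
    {
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                int32_t d = a[x] - b[x];           // 13-bit magnitude at most
                lanes[x & 3] += (uint32_t)(d * d); // 24-bit term
            }
            a += strideA;
            b += strideB;
        }
    }

    // Outer 64x64 wrapper: widens to QWORD only once per helper call,
    // i.e. the "sum into one QWORD in the last stage" suggested above.
    static uint64_t ssd_ss_64x64(const int16_t* a, intptr_t strideA,
                                 const int16_t* b, intptr_t strideB)
    {
        uint64_t sum = 0;
        for (int y = 0; y < 64; y += 16)
            for (int x = 0; x < 64; x += 8)
            {
                uint32_t lanes[4] = { 0, 0, 0, 0 };
                ssd_ss_8x16_lanes(a + y * strideA + x, strideA,
                                  b + y * strideB + x, strideB, lanes);
                sum += (uint64_t)lanes[0] + lanes[1] + lanes[2] + lanes[3];
            }
        return sum;
    }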