[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2
chen
chenm003 at 163.com
Sun Apr 12 06:06:24 CEST 2015
在 2015-04-12 02:06:41,dave <dtyx265 at gmail.com> 写道:
On 04/11/2015 12:34 AM, chen wrote:
Please add comments describing the data held in the registers, to improve readability.
Can you explain a little more about this?
[MC] There are no comments on any line; add comments showing the data held in each register to make the code more readable, e.g. 'm3 = word[9 8 7 6 5 3 2 1]'
Some suggestions are inline below.
responses below
At 2015-04-11 09:58:38,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1428717487 25200
># Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
># Parent ee76a15fa312ac59549965821d9cbff03237226f
>asm: intra pred all_angs_pred_4x4 sse2
>
>This replaces c code and is backported from sse4
>The processing of modes 10 and 26 were merged and moved to after mode 2
>
>64-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 9.99x 6449.98 64435.56
>
>32-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 13.31x 6512.49 86709.86
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Fri Apr 10 18:58:07 2015 -0700
>@@ -1259,6 +1259,8 @@
> p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
> p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
>
>+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
>+
> p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/const-a.asm Fri Apr 10 18:58:07 2015 -0700
>@@ -53,6 +53,10 @@
> const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
> const pb_movemask, times 16 db 0x00
> times 16 db 0xFF
>+const pb_0000000000000F0F, times 2 db 0xff, 0x00
The constant name is wrong.
I was trying to keep it short but if you prefer pb_00000000000000000000000000FF00FF I can do that. I can do the same for the other constants.
[MC] You can name it pw_.....000FF instead.
>+ times 14 db 0x00
Also, I should have made this 12, not 14
>+const pb_000000000000000F, db 0xff
>+ times 15 db 0x00
>
> ;; 16-bit constants
>
>@@ -94,6 +98,8 @@
> const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
> const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
> const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
>+const pw_FFFFFFF0, dw 0x00
>+ times 7 dw 0xff
>
>
> ;; 32-bit constants
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred.h Fri Apr 10 18:58:07 2015 -0700
>@@ -275,6 +275,7 @@
> void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
>--- a/source/common/x86/intrapred8_allangs.asm Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred8_allangs.asm Fri Apr 10 18:58:07 2015 -0700
>@@ -34,10 +34,17 @@
>
> ; common constant with intrapred8.asm
> cextern ang_table
>+cextern pw_ang_table
> cextern tab_S1
> cextern tab_S2
> cextern tab_Si
>
>+; constants from const-a.asm
>+cextern pw_16
>+cextern pb_000000000000000F
>+cextern pb_0000000000000F0F
>+cextern pw_FFFFFFF0
>+
>
> ;-----------------------------------------------------------------------------
> ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>@@ -23006,3 +23013,780 @@
> palignr m4, m2, m1, 14
> movu [r0 + 2111 * 16], m4
> RET
>+
>+;-----------------------------------------------------------------------------
>+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal all_angs_pred_4x4, 4, 4, 8
>+
>+; mode 2
>+
>+ movh m6, [r1 + 9]
>+ movh m2, m6
Use mova here instead: a register-to-register mova maps to a register rename.
Will do.
>+ psrldq m2, 1
>+ movd [r0], m2
>+ psrldq m2, 1
>+ movd [r0 + 4], m2
>+ psrldq m2, 1
>+ movd [r0 + 8], m2
>+ psrldq m2, 1
>+ movd [r0 + 12], m2
>+
>+; mode 10/26
>+
>+ pxor m7, m7
>+ pshufd m5, m6, 0
>+ movu [r0 + 128], m5 ;mode 10
>+
>+ movd m4, [r1 + 1]
>+ pshufd m4, m4, 0
>+ movu [r0 + 384], m4 ;mode 26
>+
>+ movd m1, [r1]
>+ punpcklbw m1, m7
>+ pshuflw m1, m1, 0x00
>+ punpcklqdq m1, m1
>+
>+ punpckldq m4, m5
>+ punpcklbw m4, m7
>+ pshuflw m2, m4, 0x00
>+ pshufhw m2, m2, 0x00
>+
>+ psubw m4, m1
>+ psraw m4, 1
>+
>+ pshufd m2, m2, q1032
>+ paddw m4, m2
>+ packuswb m4, m4
>+
>+%if ARCH_X86_64
>+ movq r2, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2, 8
>+ mov [r0 + 132], r2b
>+ shr r2, 8
>+ mov [r0 + 136], r2b
>+ shr r2, 8
>+ mov [r0 + 140], r2b
>+ shr r2, 8
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+
>+%else
>+ movd r2d, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2d, 8
>+ mov [r0 + 132], r2b
>+ shr r2d, 8
>+ mov [r0 + 136], r2b
>+ shr r2d, 8
>+ mov [r0 + 140], r2b
>+
>+ psrldq m4, 4
>+ movd r2d, m4
>+
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+%endif
>+
>+; mode 3
>+
>+ mova m2, [pw_16]
>+ lea r3, [pw_ang_table]
>+
>+ punpcklbw m6, m6
>+ psrldq m6, 1
>+ movh m1, m6
If we keep MOVH here, we can avoid the memory operand in modes 11, 13, 15, 17, etc.
>+ psrldq m6, 2
>+ movh m0, m6
>+ psrldq m6, 2
>+ movh m3, m6
>+ psrldq m6, 2
>+ punpcklbw m1, m7
>+ punpcklbw m0, m7
>+ punpcklbw m3, m7
>+ punpcklbw m6, m7
>+
>+ mova m7, [r3 + 20 * 16]
An offset of 128 or more generates a 4-byte displacement in the address encoding.
I will adjust r3 and use r2
>+ pmaddwd m5, m1, [r3 + 26 * 16]
>+ pmaddwd m4, m0, m7
>+
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 + 14 * 16]
>+ pmaddwd m6, [r3 + 8 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 16], m5
>+ movd [r0 + 68], m5 ;mode 6 row 1
>+ psrldq m5, 4
>+ movd [r0 + 76], m5 ;mode 6 row 3
>+
>+; mode 4
>+
>+ pmaddwd m4, m0, [r3 + 31 * 16]
>+ pmaddwd m6, m3, m7
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m1, [r3 + 21 * 16]
>+ pmaddwd m6, m0, [r3 + 10 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 32], m5
>+
>+; mode 5
>+
>+ pmaddwd m5, m1, [r3 + 17 * 16]
>+ pmaddwd m6, m0, [r3 + 2 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m0, [r3 + 19 * 16]
>+ pmaddwd m3, [r3 + 4 * 16]
>+
>+ packssdw m4, m3
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 48], m5
>+
>+; mode 6
>+
>+ pmaddwd m5, m1, [r3 + 13 * 16]
>+ pmaddwd m6, m0, [r3 + 7 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m6
>+ movd [r0 + 64], m5
>+ psrldq m5, 4
>+ movd [r0 + 72], m5
>+
>+; mode 7
>+
>+ pmaddwd m5, m1, [r3 + 9 * 16]
>+ pmaddwd m6, m1, [r3 + 18 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ mova m3, [r3 + 27 * 16]
>+ pmaddwd m4, m1, m3
>+ pmaddwd m0, [r3 + 4 * 16]
>+
>+ packssdw m4, m0
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 80], m5
>+
>+; mode 8
>+
>+ mova m0, [r3 + 5 * 16]
>+ pmaddwd m5, m1, m0
>+ pmaddwd m6, m1, [r3 + 10 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 15 * 16]
>+ pmaddwd m7, m1
>+
>+ packssdw m4, m7
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 96], m5
>+
>+; mode 9
>+
>+ pmaddwd m5, m1, [r3 + 2 * 16]
>+ pmaddwd m6, m1, [r3 + 4 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 6 * 16]
>+ pmaddwd m6, m1, [r3 + 8 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 112], m5
>+
>+; mode 11
>+
>+ movd m5, [r1]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
You only want the lowest 2 words, and the high qwords of both m1 and m5 are zero, so this can be replaced by PSHUFD.
The high QWord of m1 is not zero but has needed values. This code changes m1 from DCCBBAA9 to CBBAA990 where each character is the r1 index of the neighboring pixel expanded to 16 bits.
[MC] As in the comment above: when you use MOVH on m1, the high qword is cleared to zero.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150412/02913f14/attachment-0001.html>
More information about the x265-devel
mailing list