[x265] [PATCH] intra: sse4 version of strong intrasmoothing
chen
chenm003 at 163.com
Tue Nov 28 17:40:06 CET 2017
I have a few comments.
At 2017-11-28 23:57:50, "Ximing Cheng" <chengximing1989 at foxmail.com> wrote:
>diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Wed Nov 22 22:00:48 2017 +0530
>+++ b/source/common/x86/const-a.asm Tue Nov 28 17:40:59 2017 +0800
>@@ -114,6 +114,10 @@
> const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
> const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
> const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
>+const multiH3_1, times 1 dw 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48
>+const multiH3_2, times 1 dw 41, 42, 43, 44, 45, 46, 47, 48
please check the alignment of the constants above -- they are used as 16-byte memory operands of pmullw, which faults on unaligned access
>+const multiH4, times 1 dw 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
>+const multiH4_1, times 1 dw 57, 58, 59, 60, 61, 62, 63, 64
> const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
> const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
> const pw_FFFFFFFFFFFFFFF0, dw 0x00
>diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm Wed Nov 22 22:00:48 2017 +0530
>+++ b/source/common/x86/intrapred8.asm Tue Nov 28 17:40:59 2017 +0800
>@@ -543,6 +543,10 @@
> cextern multiH
> cextern multiH2
> cextern multiH3
>+cextern multiH3_1
>+cextern multiH3_2
>+cextern multiH4
>+cextern multiH4_1
> cextern multi_2Row
> cextern trans8_shuf
> cextern pw_planar16_mul
>@@ -22313,11 +22317,142 @@
> mov [r1 + 64], r3b ; LeftLast
> RET
>
>-INIT_XMM sse4
>-cglobal intra_filter_32x32, 2,4,6
>- mov r2b, byte [r0 + 64] ; topLast
>- mov r3b, byte [r0 + 128] ; LeftLast
>-
>+; this function adds the strong intra smoothing filter
>+INIT_XMM sse4
>+cglobal intra_filter_32x32, 3,8,7
>+ movzx r3d, byte [r0 + 64] ; topLast
>+ movzx r4d, byte [r0 + 128] ; LeftLast
>+
>+ ; strong intra filter is disabled
>+ cmp r2m, byte 0
>+ jz .normal_filter32
>+ ; decide to do strong intra filter
>+ movzx r5d, byte [r0] ; topLeft
>+ movzx r6d, byte [r0 + 32] ; topMiddle
>+
>+ ; threshold = 8
>+ mov r2d, r3d
>+ add r2d, r5d ; (topLast + topLeft)
>+ shl r6d, 1 ; 2 * topMiddle
>+ mov r7d, r2d
>+ sub r2d, r6d ; (topLast + topLeft) - 2 * topMiddle
>+ sub r6d, r7d ; 2 * topMiddle - (topLast + topLeft)
>+ cmovg r2d, r6d
>+ cmp r2d, 8
>+ ; bilinearAbove is false
>+ jns .normal_filter32
>+
>+ movzx r6d, byte [r0 + 96] ; leftMiddle
>+ mov r2d, r5d
>+ add r2d, r4d
>+ shl r6d, 1
>+ mov r7d, r2d
>+ sub r2d, r6d
>+ sub r6d, r7d
>+ cmovg r2d, r6d
>+ cmp r2d, 8
>+ ; bilinearLeft is false
>+ jns .normal_filter32
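For readers following along, the two checks above amount to this scalar logic (a sketch only; the ref[] offsets mirror the r0 layout used by the asm, and 8 is the 8-bit threshold, 1 << (bitDepth - 5)):

    #include <stdlib.h>

    /* Sketch of the bilinear decision implemented above: smooth strongly
       only when both the top and left reference edges are near-linear. */
    static int is_bilinear(const unsigned char *ref)
    {
        int topLeft = ref[0], topMid = ref[32], topLast = ref[64];
        int leftMid = ref[96], leftLast = ref[128];
        return abs(topLeft + topLast  - 2 * topMid)  < 8    /* bilinearAbove */
            && abs(topLeft + leftLast - 2 * leftMid) < 8;   /* bilinearLeft  */
    }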
>+
>+ ; do strong intra filter (shift = 6)
>+ mov r2d, r5d
>+ shl r2d, 6
>+ add r2d, 32 ; init
>+ mov r6d, r4d
>+ sub r6d, r5d ; deltaL
>+ mov r7d, r3d
>+ sub r7d, r5d ; deltaR
>+
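In scalar terms, the fill that follows computes two bilinear ramps with 6-bit rounding, roughly this (a sketch; names are illustrative, not from the patch):

    /* Scalar equivalent of the SIMD fill below: ramp from topLeft to
       topLast along the top edge and to leftLast along the left edge,
       endpoints pinned afterwards exactly as the asm does. */
    static void strong_filter_ref(const unsigned char *ref, unsigned char *dst)
    {
        int topLeft = ref[0], topLast = ref[64], leftLast = ref[128];
        int deltaR = topLast - topLeft;   /* slope of the top ramp  */
        int deltaL = leftLast - topLeft;  /* slope of the left ramp */
        for (int i = 1; i < 64; i++)
        {
            dst[i]      = (unsigned char)((topLeft * 64 + deltaR * i + 32) >> 6);
            dst[64 + i] = (unsigned char)((topLeft * 64 + deltaL * i + 32) >> 6);
        }
        dst[0]   = (unsigned char)topLeft;
        dst[64]  = (unsigned char)topLast;
        dst[128] = (unsigned char)leftLast;
    }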
>+ movd m0, r2d
>+ pshuflw m0, m0, 0
>+ movlhps m0, m0
>+ mova m4, m0
>+
>+
>+ movd m1, r7d
>+ pshuflw m1, m1, 0
>+ movlhps m1, m1
>+ pmullw m2, m1, [multiL] ; [ 1 2 3 4 5 6 7 8]
>+ pmullw m3, m1, [multiH] ; [ 9 10 11 12 13 14 15 16]
what's stored in the high part of m2?
moreover, X * 9 = X * 1 + X * 8, so how about storing X * 8 in the unused m7 (bumping the xmm count in cglobal to 8) to reduce memory load operations (3 cycles of latency each)?
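A sketch of that idea with SSE intrinsics (illustrative names; base is the (topLeft << 6) + 32 value set up above): after a single pmullw, each later group of eight products is the previous group plus delta * 8 held in a register, so the multiH* table loads disappear:

    #include <emmintrin.h>  /* SSE2 covers every intrinsic used here */

    /* One multiply, then register-to-register adds replace the table loads:
       the weight tables step by 8 between loads ([1..8], [9..16], ...). */
    static void ramp_no_reload(short delta, short base16, unsigned char *dst)
    {
        __m128i d     = _mm_set1_epi16(delta);
        __m128i base  = _mm_set1_epi16(base16);                      /* (topLeft << 6) + 32 */
        __m128i step8 = _mm_slli_epi16(d, 3);                        /* delta * 8, the "m7" value */
        __m128i prod  = _mm_mullo_epi16(d, _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8));

        for (int i = 0; i < 8; i++)                                  /* 8 x 8 words = 64 outputs */
        {
            __m128i v = _mm_srai_epi16(_mm_add_epi16(base, prod), 6);
            _mm_storel_epi64((__m128i *)(dst + 8 * i), _mm_packus_epi16(v, v));
            prod = _mm_add_epi16(prod, step8);                       /* delta * [9..16], [17..24], ... */
        }
    }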
>+ paddw m5, m0, m2
>+ paddw m6, m4, m3
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 1], m5
>+
>+ pmullw m2, m1, [multiH2] ; [17 18 19 20 21 22 23 24]
>+ pmullw m3, m1, [multiH3] ; [25 26 27 28 29 30 31 32]
>+ paddw m5, m0, m2
>+ paddw m6, m4, m3
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 17], m5
>+
>+ pmullw m2, m1, [multiH3_1] ; [33 - 40]
>+ pmullw m3, m1, [multiH3_2] ; [41 - 48]
>+ paddw m5, m0, m2
>+ paddw m6, m4, m3
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 33], m5
>+
>+ pmullw m2, m1, [multiH4] ; [49 - 56]
>+ pmullw m1, [multiH4_1] ; [57 - 64]
>+ paddw m5, m0, m2
>+ paddw m6, m4, m1
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 49], m5
>+
>+ movd m1, r6d
>+ pshuflw m1, m1, 0
>+ movlhps m1, m1
>+ pmullw m2, m1, [multiL] ; [ 1 2 3 4 5 6 7 8]
>+ pmullw m3, m1, [multiH] ; [ 9 10 11 12 13 14 15 16]
>+ paddw m5, m0, m2
>+ paddw m6, m4, m3
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 65], m5
>+
>+ pmullw m2, m1, [multiH2] ; [17 18 19 20 21 22 23 24]
>+ pmullw m3, m1, [multiH3] ; [25 26 27 28 29 30 31 32]
>+ paddw m5, m0, m2
>+ paddw m6, m4, m3
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 81], m5
>+
>+ pmullw m2, m1, [multiH3_1] ; [33 - 40]
>+ pmullw m3, m1, [multiH3_2] ; [41 - 48]
>+ paddw m5, m0, m2
>+ paddw m6, m4, m3
>+ psraw m5, 6
>+ psraw m6, 6
>+ packuswb m5, m6
>+ movu [r1 + 97], m5
>+
>+ pmullw m2, m1, [multiH4] ; [49 - 56]
>+ pmullw m1, [multiH4_1] ; [57 - 64]
>+ paddw m0, m2
>+ paddw m4, m1
>+ psraw m0, 6
>+ psraw m4, 6
>+ packuswb m0, m4
>+ movu [r1 + 113], m0
>+
>+ mov [r1], r5b ; topLeft
>+ mov [r1 + 64], r3b ; topLast
>+ mov [r1 + 128], r4b ; LeftLast
>+ RET
>+
>+.normal_filter32
> ; filtering top
> ; 0 to 15
> pmovzxbw m0, [r0 + 0]
>@@ -22514,8 +22649,8 @@
> packuswb m1, m5
> movu [r1 + 112], m1
>
>- mov [r1 + 64], r2b ; topLast
>- mov [r1 + 128], r3b ; LeftLast
>+ mov [r1 + 64], r3b ; topLast
>+ mov [r1 + 128], r4b ; LeftLast
> RET
>
> INIT_YMM avx2