[x265] Fwd: [PATCH] intra: sse4 version of strong intra smoothing

Ximing Cheng chengximing1989 at foxmail.com
Tue Nov 21 05:58:58 CET 2017


Thanks, I will fix this this weekend.


 
---Original---
From: "Praveen Tiwari"<praveen at multicorewareinc.com>
Date: 2017/11/21 12:46:00
To: "Development for x265"<x265-devel at videolan.org>;
Subject: [x265] Fwd: [PATCH] intra: sse4 version of strong intra smoothing




---------- Forwarded message ----------
From: chen <chenm003 at 163.com>
Date: Tue, Nov 21, 2017 at 10:07 AM
Subject: Re: [x265] [PATCH] intra: sse4 version of strong intra smoothing
To: Development for x265 <x265-devel at videolan.org>


>diff -r a7c2f80c18af -r 973560d58dfb source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asmMon Nov 20 14:31:22 2017 +0530 >+++ b/source/common/x86/intrapred8.asmTue Nov 21 03:10:14 2017 +0800 >@@ -22313,11 +22313,144 @@ >     mov             [r1 + 64], r3b                  ; LeftLast >     RET >  >-INIT_XMM sse4 >-cglobal intra_filter_32x32, 2,4,6 >-    mov             r2b, byte [r0 +  64]            ; topLast >-    mov             r3b, byte [r0 + 128]            ; LeftLast >- >+; this function add strong intra filter >+​​
INIT_XMM sse4 >+cglobal intra_filter_32x32, 3,8,7 >+    xor             r3d, r3d             ; R9 >+    xor             r4d, r4d             ; R10 >+    mov             r3b, byte [r0 +  64] ; topLast >+    mov             r4b, byte [r0 + 128] ; LeftLast


xor + mov is equivalent to movzx. The xor (clear to zero) does not cost a cycle, but it does affect the instruction decode rate.


>+ >+    ; strong intra filter is diabled >+    cmp             r2m, byte 0 >+    jz              .normal_filter32 >+    ; decide to do strong intra filter >+    xor             r5d, r5d             ; R11 >+    xor             r6d, r6d             ; RAX >+    xor             r7d, r7d             ; RDI >+    mov             r5b, byte [r0]       ; topLeft >+    mov             r6b, byte [r0 + 96]  ; leftMiddle >+    mov             r7b, byte [r0 + 32]  ; topMiddle >+ >+    ; threshold = 8 >+    mov             r2d, r3d             ; R8 >+    add             r2d, r5d             ; (topLast + topLeft) >+    shl             r7d, 1               ; 2 * topMiddle >+    sub             r2d, r7d
(A+B) - 2 * C  <==> (A-C) + (B-C)


>+    mov             r7d, r2d             ; backup r2d >+    sar             r7d, 31 >+    xor             r2d, r7d >+    sub             r2d, r7d             ; abs(r2d) >+    cmp             r2d, 8
; How about this, or the cdq instruction?
; abs(x - y)
mov     eax, X
sub     eax, Y
sub     Y, X
cmovg   eax, Y




>+    ; bilinearAbove is false >+    jns             .normal_filter32 >+ >+    mov             r2d, r5d >+    add             r2d, r4d >+    shl             r6d, 1 >+    sub             r2d, r6d >+    mov             r6d, r2d >+    sar             r6d, 31 >+    xor             r2d, r6d >+    sub             r2d, r6d >+    cmp             r2d, 8 >+    ; bilinearLeft is false >+    jns             .normal_filter32 >+ >+    ; do strong intra filter shift = 6 >+    mov             r2d, r5d >+    shl             r2d, 6 >+    add             r2d, 32              ; init >+    mov             r6d, r4d >+    sub             r6w, r5w             ; deltaL size is word
A partial-register access may cause a stall here.


>+    mov             r7d, r3d >+    sub             r7w, r5w             ; deltaR size is word >+    movd            xmm0, r2d >+    ​​
vpbroadcastw    xmm0, xmm0
SSE4?
This is an AVX2 instruction, so the initialization at the top (INIT_XMM sse4) is wrong. Also, we generally don't use the xmm/ymm prefixes; the native names m0, m1 would be better.




>+    mova            xmm4, xmm0 >+ 

_______________________________________________
 x265-devel mailing list
 x265-devel at videolan.org
 https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20171121/acc04a56/attachment.html>


More information about the x265-devel mailing list