[x265] [PATCH 2 of 2] asm: improve saoCuStatsBO by splitting the loop path and replacing PEXTRB

Min Chen chenm003 at 163.com
Fri Dec 11 01:36:44 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449793237 21600
# Node ID cf0ac10f6dffecc9c9096163f570365c1b0a4ffa
# Parent  6135ca57edd80ce619a39c542823e6cd09533b1b
asm: improve saoCuStatsBO by splitting the loop path and replacing PEXTRB
---
 source/common/x86/loopfilter.asm |   35 ++++++++++++++++++++++++++++-------
 1 files changed, 28 insertions(+), 7 deletions(-)

diff -r 6135ca57edd8 -r cf0ac10f6dff source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Dec 10 18:20:34 2015 -0600
+++ b/source/common/x86/loopfilter.asm	Thu Dec 10 18:20:37 2015 -0600
@@ -1997,14 +1997,13 @@
 ;--------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsBO, 7,12,2
+cglobal saoCuStatsBO, 7,13,2
     mova        m0, [pb_124]
-    xor         r7d, r7d
     add         r5, 4
     add         r6, 4
 
 .loopH:
-    mov         r10, r0
+    mov         r12, r0
     mov         r11, r1
     mov         r9d, r3d
 
@@ -2013,10 +2012,32 @@
     psrlw       m1, 1                   ; rec[x] >> boShift
     pand        m1, m0
 
+    cmp         r9d, 8
+    jle        .proc8
+
+    movq        r10, m1
 %assign x 0
-%rep 16
-    pextrb      r7d, m1, x
-    movsx       r8d, word [r10 + x*2]   ; diff[x]
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
+
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
+%assign x x+1
+%endrep
+    movhlps     m1, m1
+    sub         r9d, 8
+    add         r12, 8*2
+
+.proc8:
+    movq        r10, m1
+%assign x 0
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
+
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
     inc         dword  [r6 + r7]        ; count[classIdx]++
     add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
     dec         r9d
@@ -2024,7 +2045,7 @@
 %assign x x+1
 %endrep
 
-    add         r10, 16*2
+    add         r12, 8*2
     add         r11, 16
     jmp        .loopL
 



More information about the x265-devel mailing list